In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
''Pract1 - . To use PCA Algorithm for dimensionality reduction.
You have a dataset that includes measurements for different variables on wine
(alcohol, ash, magnesium, and so on). Apply PCA algorithm & transform this data
so that most variations in the measurements of the variables are captured by a small
number of principal components so that it is easier to distinguish between red and
white wine by inspecting these principal components''
In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
df= pd.read_csv('Wine.csv')
In [7]:
df.describe()
Out[7]:
Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue OD280 Proline Customer_Segment
count 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000
mean 13.000618 2.336348 2.366517 19.494944 99.741573 2.295112 2.029270 0.361854 1.590899 5.058090 0.957449 2.611685 746.893258 1.938202
std 0.811827 1.117146 0.274344 3.339564 14.282484 0.625851 0.998859 0.124453 0.572359 2.318286 0.228572 0.709990 314.907474 0.775035
min 11.030000 0.740000 1.360000 10.600000 70.000000 0.980000 0.340000 0.130000 0.410000 1.280000 0.480000 1.270000 278.000000 1.000000
25% 12.362500 1.602500 2.210000 17.200000 88.000000 1.742500 1.205000 0.270000 1.250000 3.220000 0.782500 1.937500 500.500000 1.000000
50% 13.050000 1.865000 2.360000 19.500000 98.000000 2.355000 2.135000 0.340000 1.555000 4.690000 0.965000 2.780000 673.500000 2.000000
75% 13.677500 3.082500 2.557500 21.500000 107.000000 2.800000 2.875000 0.437500 1.950000 6.200000 1.120000 3.170000 985.000000 3.000000
max 14.830000 5.800000 3.230000 30.000000 162.000000 3.880000 5.080000 0.660000 3.580000 13.000000 1.710000 4.000000 1680.000000 3.000000
In [8]:
df.head()
Out[8]:
Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue OD280 Proline Customer_Segment
0 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065 1
1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050 1
2 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185 1
3 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480 1
4 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735 1
In [12]:
df.columns
Out[12]:
Index(['Alcohol', 'Malic_Acid', 'Ash', 'Ash_Alcanity', 'Magnesium',
       'Total_Phenols', 'Flavanoids', 'Nonflavanoid_Phenols',
       'Proanthocyanins', 'Color_Intensity', 'Hue', 'OD280', 'Proline',
       'Customer_Segment'],
      dtype='object')
In [14]:
# Separate features (X) and target (y)
X = df.drop(columns=['Customer_Segment'])  # Drop the target column
y = df['Customer_Segment']  # Set the target variable
In [15]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
In [16]:
# Apply PCA and reduce dimensions to 2 principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
In [17]:
# Create a DataFrame with the principal components
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
pca_df['Customer_Segment'] = y
In [18]:
# Print explained variance ratio
print("Explained variance ratio:", pca.explained_variance_ratio_)
Explained variance ratio: [0.36198848 0.1920749 ]
In [19]:
# Visualize the PCA result
plt.figure(figsize=(8,6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='Customer_Segment', palette='Set1')
plt.title('PCA of Wine Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
#OR 
In [20]:
import pandas as pd
from sklearn.datasets import load_wine

# Load the wine dataset
wine = load_wine()
wine_df = pd.DataFrame(wine.data, columns=wine.feature_names)
wine_df['wine_type'] = wine.target  # 0, 1, or 2 corresponding to different types of wine

# Inspect the dataset
print(wine_df.head())

from sklearn.preprocessing import StandardScaler

# Separate features and target
X = wine_df.drop(columns=['wine_type'])
y = wine_df['wine_type']

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

from sklearn.decomposition import PCA

# Apply PCA
pca = PCA(n_components=2)  # Reduce to 2 principal components
X_pca = pca.fit_transform(X_scaled)

# Create a DataFrame with the two principal components
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
pca_df['wine_type'] = y

import matplotlib.pyplot as plt
import seaborn as sns

# Plot the principal components
plt.figure(figsize=(8,6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='wine_type', palette='Set1')
plt.title('PCA of Wine Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  wine_type  
0                          3.92   1065.0          0  
1                          3.40   1050.0          0  
2                          3.17   1185.0          0  
3                          3.45   1480.0          0  
4                          2.93    735.0          0  
No description has been provided for this image
In [ ]:
#OR 
In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load your wine dataset
df = pd.read_csv('Wine.csv')

# Separating features and target variable
X = df.drop(columns=['Customer_Segment'])  # Features (measurements)
y = df['Customer_Segment']  # Target variable (Type of wine)

# Scatter plot before applying PCA
plt.figure(figsize=(10, 6))
plt.scatter(X[y == 1]['Alcohol'], X[y == 1]['Malic_Acid'], label='Customer Segment 1', alpha=0.7)
plt.scatter(X[y == 2]['Alcohol'], X[y == 2]['Malic_Acid'], label='Customer Segment 2', alpha=0.7)
plt.scatter(X[y == 3]['Alcohol'], X[y == 3]['Malic_Acid'], label='Customer Segment 3', alpha=0.7)
plt.xlabel('Alcohol')
plt.ylabel('Malic Acid')
plt.legend()
plt.title('Scatter Plot (Original Data)')
plt.show()

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Create a new DataFrame with the first two principal components
pca_df = pd.DataFrame(data=X_pca, columns=['Principal Component 1', 'Principal Component 2'])
pca_df['Customer_Segment'] = y

# Visualize the data using the first two principal components
plt.figure(figsize=(10, 6))
plt.scatter(pca_df[pca_df['Customer_Segment'] == 1]['Principal Component 1'], pca_df[pca_df['Customer_Segment'] == 1]['Principal Component 2'], label='Customer Segment 1', alpha=0.7)
plt.scatter(pca_df[pca_df['Customer_Segment'] == 2]['Principal Component 1'], pca_df[pca_df['Customer_Segment'] == 2]['Principal Component 2'], label='Customer Segment 2', alpha=0.7)
plt.scatter(pca_df[pca_df['Customer_Segment'] == 3]['Principal Component 1'], pca_df[pca_df['Customer_Segment'] == 3]['Principal Component 2'], label='Customer Segment 3', alpha=0.7)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.title('PCA: Wine Dataset')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [ ]:
#OR 
In [4]:
'''Determining Principle Component Analysis for Wine Dataset'''

# Importing Preliminary Libraries

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model 
#from sklearn import linear_model.fit
from sklearn.linear_model import LinearRegression 
from sklearn.decomposition import PCA 
from sklearn.decomposition import FactorAnalysis
!pip install factor_analyzer
from factor_analyzer import FactorAnalyzer
!pip install factor_analyzer



df = pd.read_csv('Wine.csv')

'''Determing only first 5 values '''
df.head()

# Taking only the dependent value from the dataset

df2 = df[['Alcohol', 'Malic_Acid', 'Ash', 'Ash_Alcanity', 'Magnesium', 'Total_Phenols', 'Flavanoids', 'Nonflavanoid_Phenols', 'Proanthocyanins', 'Color_Intensity', 'Hue', 'OD280', 'Proline']]
df2.head()


#PCA cluster plot for Wine Dataset
#Importing libraries from SKLEARN

import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.decomposition import PCA


#loading Wine dataset
wine = datasets.load_wine()

X = wine.data
y = wine.target
target_names = wine.target_names

pca = PCA(n_components=13)
wine_X = pca.fit(X).transform(X)

#1st Plot
plt.scatter(wine_X[y == 0, 3], wine_X[y == 0, 0], s =80, c = 'orange', label = 'Type 0')
plt.scatter(wine_X[y == 1, 3], wine_X[y == 1, 0], s =80,  c = 'yellow', label = 'Type 1')
plt.scatter(wine_X[y == 2, 3], wine_X[y == 2, 0], s =80,  c = 'green', label = 'Type 2')
plt.title('PCA plot for Wine Dataset')
plt.legend()

#2nd Plot

import pandas as pd

wine_dataframe = pd.DataFrame(wine_X, columns=wine.feature_names)

# Create a scatter matrix from the dataframe, color by y_train
grr = pd.plotting.scatter_matrix(wine_dataframe, c=y, figsize=(15, 15), marker='o',
                                 hist_kwds={'bins': 20}, s=60, alpha=.8)


'''KNN classifier which is a type of supervised Machine Learning Technique. 
This is used to detect the accuracy and classification  of the given dataset'''

# Importing Libraries for Modelling.
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


# Assigning values of X and y from dataset

X, y = wine.data[:, :], wine.target  
''' Here X is assigned as all the column data(SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm) and
y is assigned as Species value(Iris-setosa, Iris-versicolour, Iris-virginica)  ))'''

#Setting training and testing values

Xtrain, Xtest, y_train, y_test = train_test_split(X, y)
scaler = preprocessing.StandardScaler().fit(Xtrain)
Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)

# Modeling is done using KNN classifiers.
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(Xtrain, y_train)
y_pred = knn.predict(Xtest)


# Display the Output

print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Confusion matrix \n',  confusion_matrix(y_test, y_pred))
print('Classification \n', classification_report(y_test, y_pred))


from sklearn.metrics import cohen_kappa_score
cluster = cohen_kappa_score(y_test, y_pred)
cluster

X, y = wine.data[:, :], wine.target  
Xtrain, Xtest, y_train, y_test = train_test_split(X, y)


#Logistic Regression Accuracy
#Logistic Regression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(Xtrain,y_train)
y_pred = classifier.predict(Xtest)
cm = confusion_matrix(y_test,y_pred)
accuracy = accuracy_score(y_test,y_pred)
print("Logistic Regression :")
print("Accuracy = ", accuracy)
print(cm)

#Cohen Kappa Accuracy for LR
from sklearn.metrics import cohen_kappa_score
cluster = cohen_kappa_score(y_test, y_pred)
cluster
Collecting factor_analyzer
  Downloading factor_analyzer-0.5.1.tar.gz (42 kB)
     ---------------------------------------- 0.0/42.8 kB ? eta -:--:--
     --------- ------------------------------ 10.2/42.8 kB ? eta -:--:--
     --------------------------- ---------- 30.7/42.8 kB 325.1 kB/s eta 0:00:01
     -------------------------------------- 42.8/42.8 kB 417.7 kB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Requirement already satisfied: pandas in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (2.1.4)
Requirement already satisfied: scipy in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (1.11.4)
Requirement already satisfied: numpy in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (1.26.3)
Requirement already satisfied: scikit-learn in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (1.4.2)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas->factor_analyzer) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas->factor_analyzer) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas->factor_analyzer) (2023.4)
Requirement already satisfied: joblib>=1.2.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn->factor_analyzer) (1.4.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn->factor_analyzer) (3.2.0)
Requirement already satisfied: six>=1.5 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from python-dateutil>=2.8.2->pandas->factor_analyzer) (1.16.0)
Building wheels for collected packages: factor_analyzer
  Building wheel for factor_analyzer (pyproject.toml): started
  Building wheel for factor_analyzer (pyproject.toml): finished with status 'done'
  Created wheel for factor_analyzer: filename=factor_analyzer-0.5.1-py2.py3-none-any.whl size=42623 sha256=0999ccdf3ec64fa15f9ccc4d8bde5e04051197c912b35434ee7ac2f459dc46c8
  Stored in directory: c:\users\lenovo\appdata\local\pip\cache\wheels\a2\af\06\f4d4ed4d9d714fda437fb1583629417319603c2266e7b233cc
Successfully built factor_analyzer
Installing collected packages: factor_analyzer
Successfully installed factor_analyzer-0.5.1
[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
Requirement already satisfied: factor_analyzer in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (0.5.1)
Requirement already satisfied: pandas in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (2.1.4)
Requirement already satisfied: scipy in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (1.11.4)
Requirement already satisfied: numpy in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (1.26.3)
Requirement already satisfied: scikit-learn in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from factor_analyzer) (1.4.2)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas->factor_analyzer) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas->factor_analyzer) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas->factor_analyzer) (2023.4)
Requirement already satisfied: joblib>=1.2.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn->factor_analyzer) (1.4.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn->factor_analyzer) (3.2.0)
Requirement already satisfied: six>=1.5 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from python-dateutil>=2.8.2->pandas->factor_analyzer) (1.16.0)
[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
Accuracy Score: 0.9777777777777777
Confusion matrix 
 [[17  0  0]
 [ 0 15  0]
 [ 0  1 12]]
Classification 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       0.94      1.00      0.97        15
           2       1.00      0.92      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.98        45
weighted avg       0.98      0.98      0.98        45

Logistic Regression :
Accuracy =  0.9333333333333333
[[14  1  0]
 [ 0 16  2]
 [ 0  0 12]]
C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[4]:
0.8993288590604027
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]:
#OR  
In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
%matplotlib inline

# Importing Dataset
df = pd.read_csv('Wine.csv')
df.head(10)

# Describe the dataset using graph - Plot 1
df.iloc[:, 1:].describe()

for c in df.columns[1:]:
    df.boxplot(c, by='Customer_Segment', figsize=(7, 4), fontsize=14)
    plt.title("{}\n".format(c), fontsize=16)
    plt.xlabel("Wine Customer Segment", fontsize=16)

# Using Standard Scaler
scaler = StandardScaler()
X = df.drop('Customer_Segment', axis=1)
y = df['Customer_Segment']
X = scaler.fit_transform(X)
dfx = pd.DataFrame(data=X, columns=df.columns[1:])
dfx.head(10)

# Plot 2
dfx.describe()
from sklearn.decomposition import PCA
pca = PCA(n_components=None)

dfx_pca = pca.fit(dfx)
plt.figure(figsize=(10, 6))
plt.scatter(x=[i + 1 for i in range(len(dfx_pca.explained_variance_ratio_))],
            y=dfx_pca.explained_variance_ratio_,
            s=200, alpha=0.75, c='orange', edgecolor='k')
plt.grid(True)
plt.title("Explained variance ratio of the \nfitted principal component vector\n", fontsize=25)
plt.xlabel("Principal components", fontsize=15)
plt.xticks([i + 1 for i in range(len(dfx_pca.explained_variance_ratio_))], fontsize=15)
plt.yticks(fontsize=15)
plt.ylabel("Explained variance ratio", fontsize=15)
plt.show()

# Transform
dfx_trans = pca.transform(dfx)
dfx_trans = pd.DataFrame(data=dfx_trans)
dfx_trans.head(10)

# Plot 3
plt.figure(figsize=(10, 6))
plt.scatter(dfx_trans[0], dfx_trans[1], c=df['Customer_Segment'], edgecolors='k', alpha=0.75, s=150)
plt.grid(True)
plt.title("Customer Segment separation using first two principal components\n", fontsize=20)
plt.xlabel("Principal component-1", fontsize=15)
plt.ylabel("Principal component-2", fontsize=15)
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]:
#OR 
'''B. Apply LDA Algorithm on Iris Dataset and classify which species a given flower
belongs to.
Dataset Link:https://www.kaggle.com/datasets/uciml/iri
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.datasets import load_iris
%matplotlib inline


# Load the Iris dataset
iris = load_iris()
X = iris.data  # Features: Sepal length, Sepal width, Petal length, Petal width
y = iris.target  # Target: Species

# Convert the dataset into a DataFrame for easy visualization and manipulation
df = pd.DataFrame(data=X, columns=iris.feature_names)
df['Species'] = y
df['Species'] = df['Species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
df.head()


# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



# Initialize the LDA model
lda = LinearDiscriminantAnalysis()

# Fit the model to the training data
lda.fit(X_train, y_train)

# Predict the species for the test set
y_pred = lda.predict(X_test)


# Accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=iris.target_names))


# Plot the LDA-transformed features to visualize class separability
X_lda = lda.transform(X)

plt.figure(figsize=(10, 6))
colors = ['red', 'green', 'blue']
for i, color, target_name in zip([0, 1, 2], colors, iris.target_names):
    plt.scatter(X_lda[y == i, 0], X_lda[y == i, 1], alpha=0.8, color=color, label=target_name)

plt.xlabel("LDA Component 1")
plt.ylabel("LDA Component 2")
plt.legend(loc='best')
plt.title("LDA Projection of Iris Dataset")
plt.show()
Accuracy: 100.00%
Confusion Matrix:
 [[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]
Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        19
  versicolor       1.00      1.00      1.00        13
   virginica       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

No description has been provided for this image
In [ ]:
 
In [ ]:
#OR 
In [ ]:
 
In [2]:
# importing required packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn import datasets
import matplotlib.pyplot as plt

# load iris dataset
iris=datasets.load_iris()

# convert dataset into a pandas dataframe
df = pd.DataFrame(data = np.c_[iris['data'], iris['target']],
                 columns = iris['feature_names'] + ['target'])
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
df.columns = ['s_length', 's_width', 'p_length', 'p_width', 'target', 'species']

df.head()

# fitting the model
X = df[['s_length', 's_width', 'p_length', 'p_width']]
y = df['species']

model = LinearDiscriminantAnalysis()
model.fit(X, y)


# evaluating the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores))


X = iris.data
y = iris.target
model = LinearDiscriminantAnalysis()
data_plot = model.fit(X, y).transform(X)
target_names = iris.target_names


plt.figure()
colors = ['red', 'green', 'blue']
lw = 2
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(data_plot[y == i, 0], data_plot[y == i, 1], alpha=.8, color=color,
                label=target_name)

plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.show()


# define new observation
new = [5, 2, 1, .4]

# predict which class the new observation belongs to
model.predict([new])
0.9800000000000001
No description has been provided for this image
Out[2]:
array([0])
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
''' Pract 2- B. Use the diabetes data set from UCI and Pima Indians Diabetes data set for performing
the following:
a. Univariate analysis: Frequency, Mean, Median, Mode, Variance, Standard
Deviation, Skewness and Kurtosis
b. Bivariate analysis: Linear and logistic regression modeling
c. Multiple Regression analysis
d. Also compare the results of the above analysis for the two data sets'''
In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression, LogisticRegression
!pip install statsmodels
import statsmodels.api as sm

# Load the dataset (make sure the file is in your working directory)
df = pd.read_csv('diabetes.csv')

# Display first few rows of the dataset
df.head()

#part 1- univarate analysis 
# Define a function for univariate statistics
def univariate_analysis(df):
    univariate_stats = {
        'Feature': [],
        'Mean': [],
        'Median': [],
        'Mode': [],
        'Variance': [],
        'Standard Deviation': [],
        'Skewness': [],
        'Kurtosis': []
    }
    
    for column in df.columns:
        if np.issubdtype(df[column].dtype, np.number):
            univariate_stats['Feature'].append(column)
            univariate_stats['Mean'].append(df[column].mean())
            univariate_stats['Median'].append(df[column].median())
            univariate_stats['Mode'].append(df[column].mode()[0])
            univariate_stats['Variance'].append(df[column].var())
            univariate_stats['Standard Deviation'].append(df[column].std())
            univariate_stats['Skewness'].append(df[column].skew())
            univariate_stats['Kurtosis'].append(df[column].kurt())
    
    return pd.DataFrame(univariate_stats)

# Univariate analysis for the dataset
univariate_results = univariate_analysis(df)
print("Univariate Analysis Results:\n", univariate_results)


#part 2 - bivarite analysis 

# Linear regression: Predicting BloodPressure from BMI
X = df[['BMI']]  # predictor
y = df['BloodPressure']  # target

linear_model = LinearRegression()
linear_model.fit(X, y)
predictions = linear_model.predict(X)

# Plot the regression line
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='blue', label='Actual data')
plt.plot(X, predictions, color='red', label='Regression line')
plt.xlabel('BMI')
plt.ylabel('Blood Pressure')
plt.title('Linear Regression: Predicting Blood Pressure from BMI')
plt.legend()
plt.show()


# logistic regression 

# Logistic regression: Predicting Outcome from Age
X_logistic = df[['Age']]  # predictor
y_logistic = df['Outcome']  # target (0 or 1)

logistic_model = LogisticRegression()
logistic_model.fit(X_logistic, y_logistic)
logistic_pred = logistic_model.predict(X_logistic)

# Model summary
print("Logistic Regression Coefficients:\n", logistic_model.coef_)
print("Intercept:", logistic_model.intercept_)


# part 3 - multi regression analysis 

# Multiple regression to predict Outcome using multiple predictors
X_multi = df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']]
y_multi = df['Outcome']

multi_logit_model = sm.Logit(y_multi, sm.add_constant(X_multi)).fit()
print(multi_logit_model.summary())
Collecting statsmodels
  Downloading statsmodels-0.14.4-cp312-cp312-win_amd64.whl.metadata (9.5 kB)
Requirement already satisfied: numpy<3,>=1.22.3 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from statsmodels) (1.26.3)
Requirement already satisfied: scipy!=1.9.2,>=1.8 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from statsmodels) (1.11.4)
Requirement already satisfied: pandas!=2.1.0,>=1.4 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from statsmodels) (2.1.4)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Requirement already satisfied: packaging>=21.3 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from statsmodels) (23.2)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2023.4)
Requirement already satisfied: six in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from patsy>=0.5.6->statsmodels) (1.16.0)
Downloading statsmodels-0.14.4-cp312-cp312-win_amd64.whl (9.8 MB)
   ---------------------------------------- 0.0/9.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.8 MB 682.7 kB/s eta 0:00:15
    --------------------------------------- 0.1/9.8 MB 1.6 MB/s eta 0:00:07
   -- ------------------------------------- 0.7/9.8 MB 5.7 MB/s eta 0:00:02
   ------- -------------------------------- 1.7/9.8 MB 11.0 MB/s eta 0:00:01
   -------- ------------------------------- 2.0/9.8 MB 9.7 MB/s eta 0:00:01
   --------- ------------------------------ 2.3/9.8 MB 9.0 MB/s eta 0:00:01
   ---------- ----------------------------- 2.7/9.8 MB 9.0 MB/s eta 0:00:01
   ----------- ---------------------------- 2.9/9.8 MB 8.5 MB/s eta 0:00:01
   ------------ --------------------------- 3.2/9.8 MB 8.1 MB/s eta 0:00:01
   -------------- ------------------------- 3.5/9.8 MB 8.0 MB/s eta 0:00:01
   --------------- ------------------------ 3.8/9.8 MB 7.8 MB/s eta 0:00:01
   ---------------- ----------------------- 4.1/9.8 MB 7.7 MB/s eta 0:00:01
   ----------------- ---------------------- 4.3/9.8 MB 7.4 MB/s eta 0:00:01
   ----------------- ---------------------- 4.4/9.8 MB 7.0 MB/s eta 0:00:01
   ------------------ --------------------- 4.5/9.8 MB 6.7 MB/s eta 0:00:01
   ------------------ --------------------- 4.6/9.8 MB 6.4 MB/s eta 0:00:01
   ------------------- -------------------- 4.7/9.8 MB 6.2 MB/s eta 0:00:01
   ------------------- -------------------- 4.9/9.8 MB 6.0 MB/s eta 0:00:01
   -------------------- ------------------- 5.0/9.8 MB 5.8 MB/s eta 0:00:01
   --------------------- ------------------ 5.2/9.8 MB 5.7 MB/s eta 0:00:01
   --------------------- ------------------ 5.3/9.8 MB 5.6 MB/s eta 0:00:01
   ---------------------- ----------------- 5.5/9.8 MB 5.5 MB/s eta 0:00:01
   ----------------------- ---------------- 5.7/9.8 MB 5.4 MB/s eta 0:00:01
   ----------------------- ---------------- 5.8/9.8 MB 5.3 MB/s eta 0:00:01
   ------------------------ --------------- 6.0/9.8 MB 5.3 MB/s eta 0:00:01
   ------------------------- -------------- 6.2/9.8 MB 5.2 MB/s eta 0:00:01
   -------------------------- ------------- 6.4/9.8 MB 5.2 MB/s eta 0:00:01
   --------------------------- ------------ 6.7/9.8 MB 5.2 MB/s eta 0:00:01
   ---------------------------- ----------- 6.9/9.8 MB 5.2 MB/s eta 0:00:01
   ----------------------------- ---------- 7.2/9.8 MB 5.3 MB/s eta 0:00:01
   ------------------------------ --------- 7.5/9.8 MB 5.3 MB/s eta 0:00:01
   ------------------------------- -------- 7.9/9.8 MB 5.3 MB/s eta 0:00:01
   --------------------------------- ------ 8.2/9.8 MB 5.4 MB/s eta 0:00:01
   ---------------------------------- ----- 8.6/9.8 MB 5.5 MB/s eta 0:00:01
   ------------------------------------ --- 8.9/9.8 MB 5.5 MB/s eta 0:00:01
   ------------------------------------- -- 9.1/9.8 MB 5.6 MB/s eta 0:00:01
   ------------------------------------- -- 9.1/9.8 MB 5.6 MB/s eta 0:00:01
   ------------------------------------- -- 9.3/9.8 MB 5.3 MB/s eta 0:00:01
   ---------------------------------------  9.6/9.8 MB 5.4 MB/s eta 0:00:01
   ---------------------------------------  9.8/9.8 MB 5.4 MB/s eta 0:00:01
   ---------------------------------------- 9.8/9.8 MB 5.3 MB/s eta 0:00:00
Downloading patsy-0.5.6-py2.py3-none-any.whl (233 kB)
   ---------------------------------------- 0.0/233.9 kB ? eta -:--:--
   --------------- ------------------------ 92.2/233.9 kB 5.1 MB/s eta 0:00:01
   ---------------------------------------- 233.9/233.9 kB 2.9 MB/s eta 0:00:00
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.6 statsmodels-0.14.4
[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
Univariate Analysis Results:
                     Feature        Mean    Median    Mode      Variance  \
0               Pregnancies    3.845052    3.0000   1.000     11.354056   
1                   Glucose  120.894531  117.0000  99.000   1022.248314   
2             BloodPressure   69.105469   72.0000  70.000    374.647271   
3             SkinThickness   20.536458   23.0000   0.000    254.473245   
4                   Insulin   79.799479   30.5000   0.000  13281.180078   
5                       BMI   31.992578   32.0000  32.000     62.159984   
6  DiabetesPedigreeFunction    0.471876    0.3725   0.254      0.109779   
7                       Age   33.240885   29.0000  22.000    138.303046   
8                   Outcome    0.348958    0.0000   0.000      0.227483   

   Standard Deviation  Skewness  Kurtosis  
0            3.369578  0.901674  0.159220  
1           31.972618  0.173754  0.640780  
2           19.355807 -1.843608  5.180157  
3           15.952218  0.109372 -0.520072  
4          115.244002  2.272251  7.214260  
5            7.884160 -0.428982  3.290443  
6            0.331329  1.919911  5.594954  
7           11.760232  1.129597  0.643159  
8            0.476951  0.635017 -1.600930  
No description has been provided for this image
Logistic Regression Coefficients:
 [[0.04202455]]
Intercept: [-2.04744807]
Optimization terminated successfully.
         Current function value: 0.470993
         Iterations 6
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                Outcome   No. Observations:                  768
Model:                          Logit   Df Residuals:                      759
Method:                           MLE   Df Model:                            8
Date:                Sat, 02 Nov 2024   Pseudo R-squ.:                  0.2718
Time:                        00:04:47   Log-Likelihood:                -361.72
converged:                       True   LL-Null:                       -496.74
Covariance Type:            nonrobust   LLR p-value:                 9.652e-54
============================================================================================
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                       -8.4047      0.717    -11.728      0.000      -9.809      -7.000
Pregnancies                  0.1232      0.032      3.840      0.000       0.060       0.186
Glucose                      0.0352      0.004      9.481      0.000       0.028       0.042
BloodPressure               -0.0133      0.005     -2.540      0.011      -0.024      -0.003
SkinThickness                0.0006      0.007      0.090      0.929      -0.013       0.014
Insulin                     -0.0012      0.001     -1.322      0.186      -0.003       0.001
BMI                          0.0897      0.015      5.945      0.000       0.060       0.119
DiabetesPedigreeFunction     0.9452      0.299      3.160      0.002       0.359       1.531
Age                          0.0149      0.009      1.593      0.111      -0.003       0.033
============================================================================================
In [ ]:
 
In [ ]:
 # OR 
'''
A. Predict the price of the Uber ride from a given pickup point to the agreed drop-off
location. Perform following tasks:
1. Pre-process the dataset.
2. Identify outliers.
3. Check the correlation.
4. Implement linear regression and ridge, Lasso regression models.
5. Evaluate the models and compare their respective scores like R2, RMSE, etc.
Dataset link: https://www.kaggle.com/datasets/yasserh/uber-fares-datase
In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.impute import SimpleImputer


# Load the dataset
df = pd.read_csv("uber.csv")

# view dataset
df.head()

df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
# print(df['pickup_datetime'])
df['hour'] = df['pickup_datetime'].dt.hour
# print(df['hour'])
df['day_of_week'] = df['pickup_datetime'].dt.dayofweek
# print(df['day_of_week'])

# check datasets for more columns we added 'hour' and 'day_of_week' column
df.head()

# Drop unnecessary columns
df = df.drop(columns=['Unnamed: 0', 'key', 'pickup_datetime'])

# check datasets for removal of columns we removed 'first_column with no name', 'key' and 'pickup_datetime' column
df.head()

# Handle missing values
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Split the data into features (X) and target (y)
X = df_imputed.drop(columns=['fare_amount'])  # create new dataset ignoring 'fare_amount' column
y = df_imputed['fare_amount']  # create a series of only 'fare_amount' column

 #Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (scaling)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Implement Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)

# Implement Ridge Regression
ridge_model = Ridge(alpha=1.0)  # You can experiment with different alpha values
ridge_model.fit(X_train_scaled, y_train)
y_pred_ridge = ridge_model.predict(X_test_scaled)

# Implement Lasso Regression
lasso_model = Lasso(alpha=0.1)  # You can experiment with different alpha values
lasso_model.fit(X_train_scaled, y_train)
y_pred_lasso = lasso_model.predict(X_test_scaled)

# Evaluate the models
def evaluate_model(y_true, y_pred, model_name):
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{model_name} - R2 Score: {r2:.4f}, RMSE: {rmse:.2f}")

evaluate_model(y_test, y_pred_lr, "Linear Regression")
evaluate_model(y_test, y_pred_ridge, "Ridge Regression")
evaluate_model(y_test, y_pred_lasso, "Lasso Regression")
Linear Regression - R2 Score: 0.0007, RMSE: 10.31
Ridge Regression - R2 Score: 0.0007, RMSE: 10.31
Lasso Regression - R2 Score: 0.0003, RMSE: 10.31
In [ ]:
 
In [ ]:
#OR
In [11]:
# Load the required packages 
import numpy as np 
import pandas as pd 
from pandas import read_csv

# Specify the file name 
filename = 'diabetes.csv'

# Read the data 
data = read_csv(filename) 

# Print the shape 
data.shape

# Print the first few rows 
data.head()

# Show the type of 'data'
type(data) 

# Get the column names 
col_idx = data.columns
col_idx

# Get row indices 
row_idx = data.index
print(row_idx)

# Find data type for each attribute 
print("Data type of each attribute:")
data.dtypes

# Generate statistical summary 
description = data.describe()
print("Statistical summary of the data:\n")
description

#Therefore, there are a total of 768 entries in the dataset. The outcome variable is set to 1 for 268 entries, and the rest are set to 0.
class_counts = data.groupby('Outcome').size() 
print("Class breakdown of the data:\n")
print(class_counts)

# Compute correlation matrix 
correlations = data.corr(method = 'pearson') 
print("Correlations of attributes in the data:\n") 
correlations

skew = data.skew() 
print("Skew of attribute distributions in the data:\n") 
skew

#histogram
# Import required package 
from matplotlib import pyplot 

pyplot.rcParams['figure.figsize'] = [20, 10]; # set the figure size 

# Draw histograms for all attributes 
data.hist()
pyplot.show()

# Density plots for all attributes
data.plot(kind='density', subplots=True, layout=(3,3), sharex=False)
pyplot.show() 

#box plot
# Draw box and whisker plots for all attributes 
data.plot(kind= 'box', subplots=True, layout=(3,3), sharex=False, sharey=False)
pyplot.show()

#multivarate plots 
# Compute the correlation matrix 
correlations = data.corr(method = 'pearson') # Correlations between all pairs of attributes
# Print the datatype 
type(correlations)
# Show the correlation matrix 
correlations

# import required package 
import numpy as np 

# plot correlation matrix
fig = pyplot.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(0,9,1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
names = data.columns
ax.set_xticklabels(names,rotation=90) # Rotate x-tick labels by 90 degrees 
ax.set_yticklabels(names)
pyplot.show()

# Import required package 
from pandas.plotting import scatter_matrix
pyplot.rcParams['figure.figsize'] = [20, 20]

# Plotting Scatterplot Matrix
scatter_matrix(data)
pyplot.show()
RangeIndex(start=0, stop=768, step=1)
Data type of each attribute:
Statistical summary of the data:

Class breakdown of the data:

Outcome
0    500
1    268
dtype: int64
Correlations of attributes in the data:

Skew of attribute distributions in the data:

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]:
''' Pract 3 Classification Analysis (Any one)
A. Implementation of Support Vector Machines (SVM) for classifying images of handwritten digits into their respective numerical classes (0 to 9)
In [4]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics 
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics


# Load the digits dataset
digits = datasets.load_digits()

# Split the data into features (X) and labels (y)
X = digits.data
y = digits.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Create an SVM classifier (linear kernel)
clf = svm.SVC(kernel='linear')

# Fit the classifier on the training data
clf.fit(X_train, y_train)

#predict the test data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy : ", accuracy)

# Confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print("Confusion Matrix : ")
print(confusion_matrix)

# Classification report
classification_report = metrics.classification_report(y_test, y_pred)
print("Classification Report : ")
print(classification_report)


# Visualize some of the test images and their predicted labels
plt.figure(figsize=(15, 8))
for i in range(10):
    plt.subplot(5, 5, i + 1)
    plt.imshow(X_test[i].reshape(8, 8), cmap=plt.cm.gray_r)
    plt.title(f"Predicted : {y_pred[i]}, Actual : {y_test[i]}")
    plt.axis('on')
Accuracy :  0.9777777777777777
Confusion Matrix : 
[[33  0  0  0  0  0  0  0  0  0]
 [ 0 28  0  0  0  0  0  0  0  0]
 [ 0  0 33  0  0  0  0  0  0  0]
 [ 0  0  0 32  0  1  0  0  0  1]
 [ 0  1  0  0 45  0  0  0  0  0]
 [ 0  0  0  0  0 47  0  0  0  0]
 [ 0  0  0  0  0  0 35  0  0  0]
 [ 0  0  0  0  0  0  0 33  0  1]
 [ 0  0  0  0  0  1  0  0 29  0]
 [ 0  0  0  1  1  0  0  1  0 37]]
Classification Report : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        33
           1       0.97      1.00      0.98        28
           2       1.00      1.00      1.00        33
           3       0.97      0.94      0.96        34
           4       0.98      0.98      0.98        46
           5       0.96      1.00      0.98        47
           6       1.00      1.00      1.00        35
           7       0.97      0.97      0.97        34
           8       1.00      0.97      0.98        30
           9       0.95      0.93      0.94        40

    accuracy                           0.98       360
   macro avg       0.98      0.98      0.98       360
weighted avg       0.98      0.98      0.98       360

No description has been provided for this image
In [ ]:
 
In [ ]:
#OR 
In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
%matplotlib inline


# Load the digits dataset
digits = datasets.load_digits()

# Print dataset information
print("Digits dataset keys:", digits.keys())
print("Image data shape:", digits.images.shape)
print("Labels shape:", digits.target.shape)

# Display some sample images with their labels
fig, axes = plt.subplots(1, 5, figsize=(10, 3))
for ax, image, label in zip(axes, digits.images, digits.target):
    ax.set_axis_off()
    ax.imshow(image, cmap='gray')
    ax.set_title(f"Label: {label}")
plt.show()


# Flatten the images for the SVM model
X = digits.images.reshape((len(digits.images), -1))
y = digits.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Initialize the SVM model with a linear kernel
svm_model = SVC(kernel='linear', C=1.0)

# Train the model
svm_model.fit(X_train, y_train)


# Predict on the test set
y_pred = svm_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Display the classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Display the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Visualize the confusion matrix
plt.figure(figsize=(10, 6))
sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='d')
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()
Digits dataset keys: dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])
Image data shape: (1797, 8, 8)
Labels shape: (1797,)
No description has been provided for this image
Model Accuracy: 97.78%
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        53
           1       0.98      0.98      0.98        50
           2       0.98      1.00      0.99        47
           3       0.96      0.96      0.96        54
           4       1.00      1.00      1.00        60
           5       0.97      0.95      0.96        66
           6       0.98      0.98      0.98        53
           7       1.00      0.98      0.99        55
           8       0.95      0.95      0.95        43
           9       0.97      0.97      0.97        59

    accuracy                           0.98       540
   macro avg       0.98      0.98      0.98       540
weighted avg       0.98      0.98      0.98       540

Confusion Matrix:
 [[53  0  0  0  0  0  0  0  0  0]
 [ 0 49  0  0  0  0  0  0  1  0]
 [ 0  0 47  0  0  0  0  0  0  0]
 [ 0  0  1 52  0  1  0  0  0  0]
 [ 0  0  0  0 60  0  0  0  0  0]
 [ 0  0  0  1  0 63  1  0  0  1]
 [ 1  0  0  0  0  0 52  0  0  0]
 [ 0  0  0  0  0  0  0 54  0  1]
 [ 0  1  0  0  0  1  0  0 41  0]
 [ 0  0  0  1  0  0  0  0  1 57]]
No description has been provided for this image
In [ ]:
 
In [ ]:
'''Pract 3 B  B. Implement K-Nearest Neighbours’ algorithm on Social network ad dataset. Compute
confusion matrix, accuracy, error rate, precision and recall on the given dataset.
In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
import seaborn as sns
%matplotlib inline


# Load the dataset
df = pd.read_csv('Social_Network_Ads.csv')

# Display the first few rows of the dataset
print(df.head())


# Drop unnecessary columns
df = df.drop(['User ID', 'Gender'], axis=1)

# Define features and target variable
X = df[['Age', 'EstimatedSalary']].values
y = df['Purchased'].values


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Initialize the KNN classifier with k=5 (you can tune this value)
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn.fit(X_train, y_train)


# Predict on the test set
y_pred = knn.predict(X_test)


# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Visualize the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Error Rate: {error_rate:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
    User ID  Gender  Age  EstimatedSalary  Purchased
0  15624510    Male   19            19000          0
1  15810944    Male   35            20000          0
2  15668575  Female   26            43000          0
3  15603246  Female   27            57000          0
4  15804002    Male   19            76000          0
No description has been provided for this image
Accuracy: 0.93
Error Rate: 0.07
Precision: 0.91
Recall: 0.91
In [ ]:
 
In [ ]:
#OR
In [8]:
# K-Nearest Neighbors (k-NN)

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values


# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


# Fitting the knn classifier to the Training set
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn.fit(X_train, y_train)


# Predicting the Test set results
y_pred = knn.predict(X_test)


# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)



# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, knn.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.5, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], alpha=0.5,
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('K-Nearest Neighbors (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()


# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, knn.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.5, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], alpha=0.9,
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('K-Nearest Neighbors (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()


# As you can see in the gifure above, the incorrect predictions using K-NN model is 7 when compared to 11 incorrect predictions 
#using Logistic regression. This concludes that though K-NN is not the best model for this prediction
#it is a better model compared to logistic regression. Cheers!
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_1368\1339922156.py:52: UserWarning: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
  plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], alpha=0.5,
No description has been provided for this image
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_1368\1339922156.py:71: UserWarning: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
  plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], alpha=0.9,
No description has been provided for this image
In [ ]:
 
In [ ]:
''' Pract 4 - A - Clustering Analysis (Any one)
A. Implement K-Means clustering on Iris.csv dataset. Determine the number of clusters
using the elbow method
In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
%matplotlib inline


# Load the dataset
df = pd.read_csv('Iris.csv')

# Display the first few rows of the dataset
print(df.head())


# Drop the 'Species' column as it is not needed for clustering
X = df.drop(['Species', 'Id'], axis=1)

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# Calculate WCSS for different number of clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

# Plot the elbow curve
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), wcss, marker='o', linestyle='-', color='b')
plt.title('Elbow Method to Determine Optimal Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster-Sum-of-Squares (WCSS)')
plt.xticks(range(1, 11))
plt.grid(True)
plt.show()


# Apply K-Means with the chosen number of clusters
optimal_clusters = 3
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
y_kmeans = kmeans.fit_predict(X_scaled)

# Add the cluster labels to the dataset
df['Cluster'] = y_kmeans
print(df.head())


# Plot the clusters
plt.figure(figsize=(10, 6))
plt.scatter(X_scaled[y_kmeans == 0, 2], X_scaled[y_kmeans == 0, 3], s=100, c='red', label='Cluster 1')
plt.scatter(X_scaled[y_kmeans == 1, 2], X_scaled[y_kmeans == 1, 3], s=100, c='blue', label='Cluster 2')
plt.scatter(X_scaled[y_kmeans == 2, 2], X_scaled[y_kmeans == 2, 3], s=100, c='green', label='Cluster 3')

# Plot centroids
plt.scatter(kmeans.cluster_centers_[:, 2], kmeans.cluster_centers_[:, 3], s=300, c='yellow', marker='X', label='Centroids')
plt.title('Clusters of Iris Dataset')
plt.xlabel('Petal Length (Standardized)')
plt.ylabel('Petal Width (Standardized)')
plt.legend()
plt.grid(True)
plt.show()
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa
No description has been provided for this image
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species  \
0   1            5.1           3.5            1.4           0.2  Iris-setosa   
1   2            4.9           3.0            1.4           0.2  Iris-setosa   
2   3            4.7           3.2            1.3           0.2  Iris-setosa   
3   4            4.6           3.1            1.5           0.2  Iris-setosa   
4   5            5.0           3.6            1.4           0.2  Iris-setosa   

   Cluster  
0        1  
1        2  
2        2  
3        2  
4        1  
No description has been provided for this image
In [ ]:
 
In [ ]:
#OR 
In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the Iris dataset
df = pd.read_csv("Iris.csv")

# Select features (attributes) for clustering (e.g., sepal_length, sepal_width, petal_length, petal_width)
X = df.iloc[:, 1:-1]  # Exclude the first column (id) and the last column (species)

# Standardize the feature matrix (important for K-Means)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Determine the optimal number of clusters using the elbow method
inertia = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

print(inertia)


# Plot the elbow curve
plt.figure(figsize=(8, 6))
plt.plot(range(1, 11), inertia, marker='o', linestyle='--')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia (Within-cluster Sum of Squares)')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.grid()
plt.show()
[600.0, 223.73200573676345, 192.03717409190028, 114.68221609937967, 91.29544474066981, 81.76026132860622, 80.98238131032987, 68.08623905064636, 59.38528882045366, 52.9899972101586]
No description has been provided for this image
In [ ]:
 
In [ ]:
#OR 
In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Load the iris dataset
iris_df = sns.load_dataset("iris")

# Display value counts for species column
print(iris_df["species"].value_counts())

# Show the first few rows
print(iris_df.head())

# Heatmap of correlations
plt.figure(figsize=(13, 10))
correlation = iris_df.select_dtypes(include=['float64']).corr()  # Only select numeric columns
sns.heatmap(correlation, annot=True, fmt='.2f', cmap="RdPu")
plt.title("Correlation Heatmap")
plt.show()

# Scatter plot of Sepal Length vs Sepal Width
plt.figure(figsize=(10, 7))
sns.set(style='whitegrid')
sns.scatterplot(x="sepal_length", y="sepal_width", hue='species', data=iris_df, palette="inferno")
plt.title("Scatter Plot of Sepal Length vs Sepal Width")
plt.show()

# Joint plot for Sepal Width vs Petal Width
sns.jointplot(x='petal_width', y='sepal_width', hue="species", palette="inferno", data=iris_df)
plt.show()

# Pair plot for all features
sns.pairplot(iris_df, hue="species", palette='gnuplot2')
plt.show()

# Prepare data for K-Means (use only numeric columns for clustering)
X = iris_df[['sepal_length', 'petal_width']].copy()

# Determine the optimal number of clusters using the Elbow Method
distances = []
for i in range(1, 7):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(X)
    distances.append(kmeans.inertia_)

# Plotting the Elbow Curve
plt.figure(figsize=(8, 6))
plt.plot(range(1, 7), distances, marker='o')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

# Apply K-Means with the optimal number of clusters (e.g., 3)
kmeans = KMeans(n_clusters=3, random_state=42)
identified_clusters = kmeans.fit_predict(X)

# Adding the cluster data to the original dataset for plotting
data_with_clusters = X.copy()
data_with_clusters['Cluster'] = identified_clusters

# Scatter plot with clusters
plt.figure(figsize=(10, 10))
scatter = plt.scatter(data_with_clusters['sepal_length'],
                      data_with_clusters['petal_width'],
                      c=data_with_clusters['Cluster'], cmap='winter')
plt.title('Scatter Plot for Sepal Length and Petal Width with Clusters')
plt.xlabel('Sepal Length')
plt.ylabel('Petal Width')
plt.legend(*scatter.legend_elements(), title='Cluster')
plt.show()

# Inference: From the figure, we can say that the optimal number of clusters is where the elbow occurs (K=3).
# With K=3, we can segment the data into 3 different clusters with minimal error.
species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [13]:
# Same as above but with solved error 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import preprocessing

# Load Iris dataset
iris_df = sns.load_dataset("iris")

# Encode the species labels as numeric values
label_encoder = preprocessing.LabelEncoder()
iris_df['species'] = label_encoder.fit_transform(iris_df['species'])

# Heatmap of correlations
plt.figure(figsize=(13, 10))
correlation = iris_df.corr()
sns.heatmap(correlation, annot=True, fmt='.2f', cmap="RdPu")
plt.title("Correlation Heatmap")
plt.show()

# Scatter plot of Sepal Length vs Sepal Width
plt.figure(figsize=(10, 7))
sns.set(style='whitegrid')
sns.scatterplot(x="sepal_length", y="sepal_width", hue='species', data=iris_df, palette="inferno")
plt.title("Scatter Plot of Sepal Length vs Sepal Width")
plt.show()

# Joint plot for Sepal Width vs Petal Width
sns.jointplot(x='petal_width', y='sepal_width', hue="species", palette="inferno", data=iris_df)
plt.show()

# Pair plot for all features
sns.pairplot(iris_df, hue="species", palette='gnuplot2')
plt.show()

# Prepare data for K-Means
X = iris_df[['sepal_length', 'petal_width']]

# Determine the optimal number of clusters using the Elbow Method
distance = []
for i in range(1, 7):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(X)
    distance.append(kmeans.inertia_)

# Plotting the Elbow Curve
plt.figure(figsize=(8, 6))
plt.plot(range(1, 7), distance, marker='o')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

# Apply K-Means with the optimal number of clusters (e.g., 3)
kmeans = KMeans(n_clusters=3, random_state=42)
identified_clusters = kmeans.fit_predict(X)

# Adding the cluster data to the original dataset for plotting
data_with_clusters = X.copy()
data_with_clusters['Cluster'] = identified_clusters

# Scatter plot with clusters
plt.figure(figsize=(10, 10))
scatter = plt.scatter(data_with_clusters['sepal_length'],
                      data_with_clusters['petal_width'],
                      c=data_with_clusters['Cluster'], cmap='winter')
plt.title('Scatter Plot for Sepal Length and Petal Width with Clusters')
plt.xlabel('Sepal Length')
plt.ylabel('Petal Width')
plt.legend(*scatter.legend_elements(), title='Cluster')
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]:
'''Pract 4 - B - B. Implement K-Mediod Algorithm on a credit card dataset. Determine the number of
clusters using the Silhouette Method.
In [ ]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load your dataset
data = pd.read_csv('creditcard.csv')  # Update with your actual file path
print(data.head())

# Handle missing values: Drop rows with missing values
data_cleaned = data.dropna()

# Select only numeric features for clustering
numeric_data = data_cleaned.select_dtypes(include=[np.number])

# Standardize the numeric features
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_data)

# K-Medoids Algorithm
def k_medoids(X, num_clusters, max_iterations=100):
    m, n = X.shape
    medoids_indices = random.sample(range(m), num_clusters)
    medoids = X[medoids_indices]
    
    for _ in range(max_iterations):
        clusters = np.zeros(m)
        
        # Assign clusters based on the nearest medoid
        for i in range(m):
            distances = np.linalg.norm(X[i] - medoids, axis=1)
            clusters[i] = np.argmin(distances)

        # Update medoids
        new_medoids = np.zeros((num_clusters, n))
        for i in range(num_clusters):
            cluster_points = X[clusters == i]
            if len(cluster_points) > 0:
                distances_in_cluster = np.sum(np.linalg.norm(cluster_points[:, np.newaxis] - cluster_points, axis=2), axis=1)
                new_medoids[i] = cluster_points[np.argmin(distances_in_cluster)]
            else:
                new_medoids[i] = medoids[i]  # Keep the old medoid if the cluster is empty

        medoids = new_medoids
    
    return clusters, medoids

# Determine the optimal number of clusters using the Silhouette Method
silhouette_scores = []
range_n_clusters = range(2, 10)  # Try different cluster sizes
optimal_n_clusters = 0
best_silhouette_score = -1

for n_clusters in range_n_clusters:
    clusters, medoids = k_medoids(scaled_data, n_clusters)
    silhouette_avg = silhouette_score(scaled_data, clusters)
    silhouette_scores.append(silhouette_avg)
    print(f"For n_clusters = {n_clusters}, the silhouette score is: {silhouette_avg}")
    
    # Track the optimal number of clusters
    if silhouette_avg > best_silhouette_score:
        best_silhouette_score = silhouette_avg
        optimal_n_clusters = n_clusters

print(f"\nOptimal number of clusters based on silhouette score is: {optimal_n_clusters}")

# Plotting silhouette scores to visualize the optimal number of clusters
plt.figure(figsize=(10, 6))
plt.plot(range_n_clusters, silhouette_scores, marker='o')
plt.title('Silhouette Scores for K-Medoids')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(range_n_clusters)
plt.grid()
plt.show()
  CUST_ID      BALANCE  BALANCE_FREQUENCY  PURCHASES  ONEOFF_PURCHASES  \
0  C10001    40.900749           0.818182      95.40              0.00   
1  C10002  3202.467416           0.909091       0.00              0.00   
2  C10003  2495.148862           1.000000     773.17            773.17   
3  C10004  1666.670542           0.636364    1499.00           1499.00   
4  C10005   817.714335           1.000000      16.00             16.00   

   INSTALLMENTS_PURCHASES  CASH_ADVANCE  PURCHASES_FREQUENCY  \
0                    95.4      0.000000             0.166667   
1                     0.0   6442.945483             0.000000   
2                     0.0      0.000000             1.000000   
3                     0.0    205.788017             0.083333   
4                     0.0      0.000000             0.083333   

   ONEOFF_PURCHASES_FREQUENCY  PURCHASES_INSTALLMENTS_FREQUENCY  \
0                    0.000000                          0.083333   
1                    0.000000                          0.000000   
2                    1.000000                          0.000000   
3                    0.083333                          0.000000   
4                    0.083333                          0.000000   

   CASH_ADVANCE_FREQUENCY  CASH_ADVANCE_TRX  PURCHASES_TRX  CREDIT_LIMIT  \
0                0.000000                 0              2        1000.0   
1                0.250000                 4              0        7000.0   
2                0.000000                 0             12        7500.0   
3                0.083333                 1              1        7500.0   
4                0.000000                 0              1        1200.0   

      PAYMENTS  MINIMUM_PAYMENTS  PRC_FULL_PAYMENT  TENURE  
0   201.802084        139.509787          0.000000      12  
1  4103.032597       1072.340217          0.222222      12  
2   622.066742        627.284787          0.000000      12  
3     0.000000               NaN          0.000000      12  
4   678.334763        244.791237          0.000000      12  
In [1]:
!pip install pyclustering
Requirement already satisfied: pyclustering in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (0.10.1.2)
Requirement already satisfied: scipy>=1.1.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pyclustering) (1.11.4)
Requirement already satisfied: matplotlib>=3.0.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pyclustering) (3.8.2)
Requirement already satisfied: numpy>=1.15.2 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pyclustering) (1.26.3)
Requirement already satisfied: Pillow>=5.2.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from pyclustering) (10.2.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (1.2.0)
Requirement already satisfied: cycler>=0.10 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (4.47.2)
Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (1.4.5)
Requirement already satisfied: packaging>=20.0 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (23.2)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (3.1.1)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from matplotlib>=3.0.0->pyclustering) (2.8.2)
Requirement already satisfied: six>=1.5 in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from python-dateutil>=2.7->matplotlib>=3.0.0->pyclustering) (1.16.0)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
'''Pract 5 5 Ensemble Learning (Any one)
A. Implement Random Forest Classifier model to predict the safety of the car.
In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Load the dataset (replace with your actual file path)
data = pd.read_csv('car_safety.csv')  # Update with your actual file path

# Print the first few rows of the dataset to understand its structure
print(data.head())

# Check the unique values in the target variable
print(data['unacc'].unique())

# Preprocess the data: handle missing values and encode categorical variables
data = data.dropna()  # Drop rows with missing values, or you can use imputation

# Example: Encoding categorical variables
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])

# Define features and target variable
X = data.drop('unacc', axis=1)  # Features
y = data['unacc']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
   vhigh vhigh.1  2 2.1  small   low  unacc
0  vhigh   vhigh  2   2  small   med  unacc
1  vhigh   vhigh  2   2  small  high  unacc
2  vhigh   vhigh  2   2    med   low  unacc
3  vhigh   vhigh  2   2    med   med  unacc
4  vhigh   vhigh  2   2    med  high  unacc
['unacc' 'acc' 'vgood' 'good']
Confusion Matrix:
[[108   1   7   2]
 [  1  12   0   4]
 [  1   0 360   0]
 [  2   0   0  21]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       118
           1       0.92      0.71      0.80        17
           2       0.98      1.00      0.99       361
           3       0.78      0.91      0.84        23

    accuracy                           0.97       519
   macro avg       0.91      0.88      0.89       519
weighted avg       0.97      0.97      0.96       519

In [ ]:
 
In [ ]:
#OR 
In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the car evaluation dataset
data = pd.read_csv("car_safety.csv")

# Encoding all the string data
data = data.apply(LabelEncoder().fit_transform)

# Define the features (X) and the target variable (y)
X = data.iloc[:, :-1]  # Features (all columns except the last one)
y = data.iloc[:, -1]   # Target variable (last column)

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("\nConfusion Matrix:\n", confusion)
print("\nClassification Report:\n", classification_rep)
Accuracy: 0.9624277456647399

Confusion Matrix:
 [[ 72   1   3   1]
 [  2  10   0   3]
 [  1   0 236   0]
 [  2   0   0  15]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94        77
           1       0.91      0.67      0.77        15
           2       0.99      1.00      0.99       237
           3       0.79      0.88      0.83        17

    accuracy                           0.96       346
   macro avg       0.91      0.87      0.88       346
weighted avg       0.96      0.96      0.96       346

In [9]:
!pip install xgboost
Collecting xgboost
  Downloading xgboost-2.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Requirement already satisfied: numpy in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from xgboost) (1.26.3)
Requirement already satisfied: scipy in c:\users\lenovo\appdata\local\programs\python\python312\lib\site-packages (from xgboost) (1.11.4)
Downloading xgboost-2.1.2-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
    --------------------------------------- 1.8/124.9 MB 10.1 MB/s eta 0:00:13
   - -------------------------------------- 3.1/124.9 MB 8.0 MB/s eta 0:00:16
   - -------------------------------------- 4.2/124.9 MB 7.2 MB/s eta 0:00:17
   - -------------------------------------- 5.2/124.9 MB 6.8 MB/s eta 0:00:18
   -- ------------------------------------- 6.3/124.9 MB 6.1 MB/s eta 0:00:20
   -- ------------------------------------- 7.3/124.9 MB 5.9 MB/s eta 0:00:20
   -- ------------------------------------- 8.7/124.9 MB 6.0 MB/s eta 0:00:20
   -- ------------------------------------- 9.2/124.9 MB 5.8 MB/s eta 0:00:20
   --- ------------------------------------ 10.2/124.9 MB 5.5 MB/s eta 0:00:21
   --- ------------------------------------ 11.3/124.9 MB 5.5 MB/s eta 0:00:21
   ---- ----------------------------------- 12.6/124.9 MB 5.6 MB/s eta 0:00:21
   ---- ----------------------------------- 14.2/124.9 MB 5.7 MB/s eta 0:00:20
   ----- ---------------------------------- 15.7/124.9 MB 5.9 MB/s eta 0:00:19
   ----- ---------------------------------- 16.5/124.9 MB 5.7 MB/s eta 0:00:19
   ----- ---------------------------------- 17.3/124.9 MB 5.6 MB/s eta 0:00:20
   ----- ---------------------------------- 18.1/124.9 MB 5.6 MB/s eta 0:00:20
   ------ --------------------------------- 19.4/124.9 MB 5.6 MB/s eta 0:00:19
   ------ --------------------------------- 21.2/124.9 MB 5.7 MB/s eta 0:00:19
   ------- -------------------------------- 22.8/124.9 MB 5.8 MB/s eta 0:00:18
   ------- -------------------------------- 23.9/124.9 MB 5.9 MB/s eta 0:00:18
   ------- -------------------------------- 24.6/124.9 MB 5.7 MB/s eta 0:00:18
   -------- ------------------------------- 25.4/124.9 MB 5.7 MB/s eta 0:00:18
   -------- ------------------------------- 26.7/124.9 MB 5.6 MB/s eta 0:00:18
   -------- ------------------------------- 28.0/124.9 MB 5.7 MB/s eta 0:00:18
   --------- ------------------------------ 29.9/124.9 MB 5.8 MB/s eta 0:00:17
   --------- ------------------------------ 31.2/124.9 MB 5.8 MB/s eta 0:00:17
   ---------- ----------------------------- 31.5/124.9 MB 5.7 MB/s eta 0:00:17
   ---------- ----------------------------- 31.5/124.9 MB 5.7 MB/s eta 0:00:17
   ---------- ----------------------------- 31.5/124.9 MB 5.7 MB/s eta 0:00:17
   ---------- ----------------------------- 31.5/124.9 MB 5.7 MB/s eta 0:00:17
   ---------- ----------------------------- 31.5/124.9 MB 5.7 MB/s eta 0:00:17
   ---------- ----------------------------- 31.7/124.9 MB 4.8 MB/s eta 0:00:20
   ---------- ----------------------------- 32.2/124.9 MB 4.7 MB/s eta 0:00:20
   ---------- ----------------------------- 33.8/124.9 MB 4.8 MB/s eta 0:00:20
   ----------- ---------------------------- 35.9/124.9 MB 4.9 MB/s eta 0:00:19
   ----------- ---------------------------- 36.4/124.9 MB 4.9 MB/s eta 0:00:19
   ----------- ---------------------------- 37.2/124.9 MB 4.9 MB/s eta 0:00:19
   ------------ --------------------------- 38.3/124.9 MB 4.9 MB/s eta 0:00:18
   ------------ --------------------------- 39.6/124.9 MB 4.9 MB/s eta 0:00:18
   ------------- -------------------------- 41.2/124.9 MB 4.9 MB/s eta 0:00:17
   ------------- -------------------------- 42.7/124.9 MB 5.0 MB/s eta 0:00:17
   -------------- ------------------------- 44.0/124.9 MB 5.0 MB/s eta 0:00:17
   -------------- ------------------------- 45.4/124.9 MB 5.1 MB/s eta 0:00:16
   -------------- ------------------------- 46.4/124.9 MB 5.1 MB/s eta 0:00:16
   --------------- ------------------------ 47.2/124.9 MB 5.0 MB/s eta 0:00:16
   --------------- ------------------------ 48.5/124.9 MB 5.1 MB/s eta 0:00:16
   ---------------- ----------------------- 50.1/124.9 MB 5.1 MB/s eta 0:00:15
   ---------------- ----------------------- 51.6/124.9 MB 5.2 MB/s eta 0:00:15
   ---------------- ----------------------- 53.0/124.9 MB 5.2 MB/s eta 0:00:14
   ----------------- ---------------------- 54.0/124.9 MB 5.2 MB/s eta 0:00:14
   ----------------- ---------------------- 55.1/124.9 MB 5.2 MB/s eta 0:00:14
   ----------------- ---------------------- 56.1/124.9 MB 5.2 MB/s eta 0:00:14
   ------------------ --------------------- 57.4/124.9 MB 5.2 MB/s eta 0:00:13
   ------------------ --------------------- 58.7/124.9 MB 5.2 MB/s eta 0:00:13
   ------------------- -------------------- 59.8/124.9 MB 5.2 MB/s eta 0:00:13
   ------------------- -------------------- 61.1/124.9 MB 5.2 MB/s eta 0:00:13
   -------------------- ------------------- 62.7/124.9 MB 5.3 MB/s eta 0:00:12
   -------------------- ------------------- 64.0/124.9 MB 5.3 MB/s eta 0:00:12
   -------------------- ------------------- 65.0/124.9 MB 5.3 MB/s eta 0:00:12
   --------------------- ------------------ 66.3/124.9 MB 5.3 MB/s eta 0:00:12
   --------------------- ------------------ 67.4/124.9 MB 5.3 MB/s eta 0:00:11
   --------------------- ------------------ 68.7/124.9 MB 5.3 MB/s eta 0:00:11
   ---------------------- ----------------- 70.5/124.9 MB 5.4 MB/s eta 0:00:11
   ---------------------- ----------------- 71.3/124.9 MB 5.4 MB/s eta 0:00:10
   ----------------------- ---------------- 71.8/124.9 MB 5.3 MB/s eta 0:00:10
   ----------------------- ---------------- 72.9/124.9 MB 5.3 MB/s eta 0:00:10
   ----------------------- ---------------- 73.9/124.9 MB 5.3 MB/s eta 0:00:10
   ------------------------ --------------- 75.2/124.9 MB 5.3 MB/s eta 0:00:10
   ------------------------ --------------- 76.8/124.9 MB 5.3 MB/s eta 0:00:10
   ------------------------- -------------- 78.6/124.9 MB 5.4 MB/s eta 0:00:09
   ------------------------- -------------- 79.7/124.9 MB 5.4 MB/s eta 0:00:09
   ------------------------- -------------- 80.5/124.9 MB 5.4 MB/s eta 0:00:09
   -------------------------- ------------- 81.5/124.9 MB 5.4 MB/s eta 0:00:09
   -------------------------- ------------- 82.6/124.9 MB 5.4 MB/s eta 0:00:08
   -------------------------- ------------- 84.1/124.9 MB 5.4 MB/s eta 0:00:08
   --------------------------- ------------ 86.0/124.9 MB 5.4 MB/s eta 0:00:08
   --------------------------- ------------ 87.3/124.9 MB 5.4 MB/s eta 0:00:07
   ---------------------------- ----------- 87.6/124.9 MB 5.4 MB/s eta 0:00:07
   ---------------------------- ----------- 88.3/124.9 MB 5.4 MB/s eta 0:00:07
   ---------------------------- ----------- 89.1/124.9 MB 5.4 MB/s eta 0:00:07
   ---------------------------- ----------- 90.4/124.9 MB 5.4 MB/s eta 0:00:07
   ----------------------------- ---------- 91.8/124.9 MB 5.4 MB/s eta 0:00:07
   ----------------------------- ---------- 93.6/124.9 MB 5.4 MB/s eta 0:00:06
   ------------------------------ --------- 95.2/124.9 MB 5.4 MB/s eta 0:00:06
   ------------------------------ --------- 95.9/124.9 MB 5.4 MB/s eta 0:00:06
   ------------------------------- -------- 97.0/124.9 MB 5.4 MB/s eta 0:00:06
   ------------------------------- -------- 98.0/124.9 MB 5.4 MB/s eta 0:00:05
   ------------------------------- -------- 99.4/124.9 MB 5.4 MB/s eta 0:00:05
   -------------------------------- ------- 101.2/124.9 MB 5.4 MB/s eta 0:00:05
   -------------------------------- ------- 102.5/124.9 MB 5.5 MB/s eta 0:00:05
   --------------------------------- ------ 103.5/124.9 MB 5.5 MB/s eta 0:00:04
   --------------------------------- ------ 104.6/124.9 MB 5.4 MB/s eta 0:00:04
   --------------------------------- ------ 105.9/124.9 MB 5.5 MB/s eta 0:00:04
   ---------------------------------- ----- 107.2/124.9 MB 5.5 MB/s eta 0:00:04
   ---------------------------------- ----- 108.8/124.9 MB 5.5 MB/s eta 0:00:03
   ----------------------------------- ---- 109.6/124.9 MB 5.5 MB/s eta 0:00:03
   ----------------------------------- ---- 110.6/124.9 MB 5.5 MB/s eta 0:00:03
   ----------------------------------- ---- 111.7/124.9 MB 5.5 MB/s eta 0:00:03
   ------------------------------------ --- 113.2/124.9 MB 5.5 MB/s eta 0:00:03
   ------------------------------------ --- 115.1/124.9 MB 5.5 MB/s eta 0:00:02
   ------------------------------------- -- 115.9/124.9 MB 5.5 MB/s eta 0:00:02
   ------------------------------------- -- 116.9/124.9 MB 5.5 MB/s eta 0:00:02
   ------------------------------------- -- 118.0/124.9 MB 5.5 MB/s eta 0:00:02
   -------------------------------------- - 119.3/124.9 MB 5.5 MB/s eta 0:00:02
   -------------------------------------- - 121.1/124.9 MB 5.5 MB/s eta 0:00:01
   ---------------------------------------  121.9/124.9 MB 5.5 MB/s eta 0:00:01
   ---------------------------------------  123.2/124.9 MB 5.5 MB/s eta 0:00:01
   ---------------------------------------  124.5/124.9 MB 5.5 MB/s eta 0:00:01
   ---------------------------------------  124.8/124.9 MB 5.5 MB/s eta 0:00:01
   ---------------------------------------- 124.9/124.9 MB 5.5 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.1.2
In [ ]:
 
In [ ]:
 
In [ ]:
'''pract 5 - B B. Use different voting mechanism and Apply AdaBoost (Adaptive Boosting), Gradient
Tree Boosting (GBM), XGBoost classification on Iris dataset and compare the
performance of three models using different evaluation measures.
Dataset Link: https://www.kaggle.com/datasets/uciml/iris
In [10]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize classifiers
ada_classifier = AdaBoostClassifier(n_estimators=100, random_state=42)
gbm_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Train the classifiers
ada_classifier.fit(X_train, y_train)
gbm_classifier.fit(X_train, y_train)
xgb_classifier.fit(X_train, y_train)

# Make predictions
ada_pred = ada_classifier.predict(X_test)
gbm_pred = gbm_classifier.predict(X_test)
xgb_pred = xgb_classifier.predict(X_test)

# Evaluate the models
def evaluate_model(predictions, model_name):
    print(f"Evaluation for {model_name}:")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))
    print(f"Accuracy: {accuracy_score(y_test, predictions)}\n")

# Display evaluation for each model
evaluate_model(ada_pred, "AdaBoost")
evaluate_model(gbm_pred, "Gradient Boosting")
evaluate_model(xgb_pred, "XGBoost")
C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\ensemble\_weight_boosting.py:519: FutureWarning: The SAMME.R algorithm (the default) is deprecated and will be removed in 1.6. Use the SAMME algorithm to circumvent this warning.
  warnings.warn(
C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py:158: UserWarning: [23:54:35] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0ed59c031377d09b8-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
Evaluation for AdaBoost:
Confusion Matrix:
[[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

Accuracy: 1.0

Evaluation for Gradient Boosting:
Confusion Matrix:
[[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

Accuracy: 1.0

Evaluation for XGBoost:
Confusion Matrix:
[[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

Accuracy: 1.0

In [ ]:
 
In [ ]:
'''Pract 6- C. Build a Tic-Tac-Toe game using reinforcement learning in Python by using following
tasks
a. Setting up the environment
b. Defining the Tic-Tac-Toe game
c. Building the reinforcement learning model
d. Training the model
e. Testing the model
In [15]:
import numpy as np
import random

class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.done = False
        self.winner = None

    def reset(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.done = False
        self.winner = None
        return self.board

    def check_winner(self):
        for i in range(3):
            if abs(sum(self.board[i, :])) == 3:
                self.winner = np.sign(self.board[i, 0])
                return True
            if abs(sum(self.board[:, i])) == 3:
                self.winner = np.sign(self.board[0, i])
                return True
        if abs(self.board[0, 0] + self.board[1, 1] + self.board[2, 2]) == 3:
            self.winner = np.sign(self.board[0, 0])
            return True
        if abs(self.board[0, 2] + self.board[1, 1] + self.board[2, 0]) == 3:
            self.winner = np.sign(self.board[0, 2])
            return True
        if not np.any(self.board == 0):
            self.winner = 0  # Draw
            return True
        return False

    def step(self, player, action):
        if self.board[action[0], action[1]] != 0 or self.done:
            return False
        self.board[action[0], action[1]] = player
        self.done = self.check_winner()
        return True

    def available_actions(self):
        return list(zip(*np.where(self.board == 0)))

# Q-learning agent
class QLearningAgent:
    def __init__(self, alpha=0.1, gamma=0.9, epsilon=1.0):
        self.q_table = {}
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

    def get_q_values(self, state):
        return self.q_table.get(state.tobytes(), np.zeros(9))

    def choose_action(self, state, available_actions):
        if random.random() < self.epsilon:
            return random.choice(available_actions)
        q_values = self.get_q_values(state)
        best_action_index = np.argmax(q_values)
        best_action = available_actions[best_action_index % len(available_actions)]
        return best_action

    def update_q_value(self, state, action, reward, next_state):
        q_values = self.get_q_values(state)
        action_index = action[0] * 3 + action[1]
        next_max = np.max(self.get_q_values(next_state))
        q_values[action_index] += self.alpha * (reward + self.gamma * next_max - q_values[action_index])
        self.q_table[state.tobytes()] = q_values

# Training the agent
env = TicTacToe()
agent = QLearningAgent()

episodes = 1000
for episode in range(episodes):
    state = env.reset()
    done = False
    while not done:
        available_actions = env.available_actions()
        action = agent.choose_action(state, available_actions)
        player = 1 if episode % 2 == 0 else -1  # Alternate players
        env.step(player, action)
        next_state = env.board
        reward = 1 if env.winner == player else -1 if env.winner == -player else 0
        agent.update_q_value(state, action, reward, next_state)
        state = next_state
        if env.done:
            break
    if episode % 100 == 0:
        print(f"Episode {episode} completed.")

# Testing the agent
for test_game in range(3):
    state = env.reset()
    done = False
    print(f"\nTest Game {test_game + 1}")
    while not done:
        available_actions = env.available_actions()
        action = agent.choose_action(state, available_actions)
        player = 1 if test_game % 2 == 0 else -1
        env.step(player, action)
        print(env.board)
        state = env.board
        if env.done:
            if env.winner == 1:
                print("Agent (Player 1) wins!")
            elif env.winner == -1:
                print("Opponent (Player -1) wins!")
            else:
                print("It's a draw!")
            break
Episode 0 completed.
Episode 100 completed.
Episode 200 completed.
Episode 300 completed.
Episode 400 completed.
Episode 500 completed.
Episode 600 completed.
Episode 700 completed.
Episode 800 completed.
Episode 900 completed.

Test Game 1
[[1 0 0]
 [0 0 0]
 [0 0 0]]
[[1 0 0]
 [0 1 0]
 [0 0 0]]
[[1 0 0]
 [1 1 0]
 [0 0 0]]
[[1 1 0]
 [1 1 0]
 [0 0 0]]
[[1 1 1]
 [1 1 0]
 [0 0 0]]
Agent (Player 1) wins!

Test Game 2
[[ 0 -1  0]
 [ 0  0  0]
 [ 0  0  0]]
[[-1 -1  0]
 [ 0  0  0]
 [ 0  0  0]]
[[-1 -1 -1]
 [ 0  0  0]
 [ 0  0  0]]
Opponent (Player -1) wins!

Test Game 3
[[0 0 0]
 [0 0 0]
 [0 1 0]]
[[0 0 1]
 [0 0 0]
 [0 1 0]]
[[1 0 1]
 [0 0 0]
 [0 1 0]]
[[1 0 1]
 [1 0 0]
 [0 1 0]]
[[1 1 1]
 [1 0 0]
 [0 1 0]]
Agent (Player 1) wins!
In [ ]:
 
In [ ]:
#OR 
In [16]:
import numpy as np
import random


# Task a & b: Setting up the Tic-Tac-Toe environment
class TicTacToeEnv:
    def __init__(self):
        self.reset()
    
    def reset(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.done = False
        self.current_player = 1  # 1 for 'X', -1 for 'O'
        return tuple(self.board.flatten())
    
    def available_actions(self):
        return [(i, j) for i in range(3) for j in range(3) if self.board[i, j] == 0]
    
    def step(self, action):
        if self.done:
            return tuple(self.board.flatten()), 0, True  # Game is already over
        i, j = action
        self.board[i, j] = self.current_player
        reward = self.check_winner()
        self.done = reward != 0 or not self.available_actions()
        self.current_player *= -1
        return tuple(self.board.flatten()), reward, self.done

    def check_winner(self):
        for i in range(3):
            if abs(sum(self.board[i, :])) == 3 or abs(sum(self.board[:, i])) == 3:
                return 1 * self.current_player
        if abs(sum(self.board.diagonal())) == 3 or abs(sum(np.fliplr(self.board).diagonal())) == 3:
            return 1 * self.current_player
        return 0


# Task c: Building the Q-learning model
Q = {}


def choose_action(state, epsilon=0.1):
    if state not in Q:
        Q[state] = {a: 0 for a in env.available_actions()}
    return random.choice(env.available_actions()) if random.random() < epsilon else max(Q[state], key=Q[state].get)

def update_q(state, action, reward, next_state, alpha=0.1, gamma=0.95):
    if state not in Q:
        Q[state] = {a: 0 for a in env.available_actions()}
    if next_state not in Q:
        # Set Q[next_state] with a default value of 0 if no available actions
        Q[next_state] = {a: 0 for a in env.available_actions()} or {(0, 0): 0} 
    
    # Q-learning update rule with terminal state check
    max_future_q = max(Q[next_state].values()) if Q[next_state] else 0
    Q[state][action] += alpha * (reward + gamma * max_future_q - Q[state][action])



# Task d: Training the model
env = TicTacToeEnv()
for episode in range(10000):
    state = env.reset()
    done = False
    while not done:
        action = choose_action(state)
        next_state, reward, done = env.step(action)
        update_q(state, action, reward, next_state)
        state = next_state


# Task e: Testing the model
def test_model():
    state = env.reset()
    done = False
    while not done:
        action = choose_action(state, epsilon=0)  # Always exploit
        next_state, reward, done = env.step(action)
        print(np.reshape(next_state, (3, 3)))
        state = next_state
        if done:
            if reward > 0:
                print("AI won!")
            elif reward < 0:
                print("AI lost!")
            else:
                print("It's a draw!")


# Run a test
test_model()
[[1 0 0]
 [0 0 0]
 [0 0 0]]
[[ 1 -1  0]
 [ 0  0  0]
 [ 0  0  0]]
[[ 1 -1  0]
 [ 0  0  0]
 [ 1  0  0]]
[[ 1 -1 -1]
 [ 0  0  0]
 [ 1  0  0]]
[[ 1 -1 -1]
 [ 1  0  0]
 [ 1  0  0]]
AI won!
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
#---------------------------------------------------------DMV-----------------------------------------------------------------------------------
In [ ]:
 
In [ ]:
'''Pract 7 - 7 Data Loading, Storage and File Formats
Problem Statement: Analyzing Sales Data from Multiple File Formats
Dataset: Sales data in multiple file formats (e.g., CSV, Excel, JSON)
Description: The goal is to load and analyze sales data from different file formats, including
CSV, Excel, and JSON, and perform data cleaning, transformation, and analysis on the
dataset.
Tasks to Perform:
Obtain sales data files in various formats, such as CSV, Excel, and JSON.
1. Load the sales data from each file format into the appropriate data structures or
dataframes.
2. Explore the structure and content of the loaded data, identifying any inconsistencies,
missing values, or data quality issues.
3. Perform data cleaning operations, such as handling missing values, removing
duplicates, or correcting inconsistencies.
4. Convert the data into a unified format, such as a common dataframe or data structure,
to enable seamless analysis 
5. Perform data transformation tasks, such as merging multiple datasets, splitting
columns, or deriving new variables.
6. Analyze the sales data by performing descriptive statistics, aggregating data by
specific variables, or calculating metrics such as total sales, average order value, or
product category distribution.
7. Create visualizations, such as bar plots, pie charts, or box plots, to represent the sales
data and gain insights into sales trends, customer behavior, or product performance.
In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob


# Load CSV
df_csv = pd.read_csv("sales_data_sample.csv" , encoding="cp1252")

# Load Excel
df_excel = pd.read_excel("Sample-Sales-Data.xlsx")

# Load JSON
df_json = pd.read_json("sales3.json")


print("CSV Data Head:\n", df_csv.head())
print("Excel Data Head:\n", df_excel.head())
print("JSON Data Head:\n", df_json.head())

print("\nCSV Data Info:")
print(df_csv.info())
print("\nExcel Data Info:")
print(df_excel.info())
print("\nJSON Data Info:")
print(df_json.info())


# Check for missing values
print("CSV Missing Values:\n", df_csv.isnull().sum())
print("Excel Missing Values:\n", df_excel.isnull().sum())
print("JSON Missing Values:\n", df_json.isnull().sum())

# Fill or drop missing values (example: filling with mean for numerical columns)
df_csv.fillna(df_csv.mean(), inplace=True)
df_excel.fillna(df_excel.mean(), inplace=True)
df_json.fillna(df_json.mean(), inplace=True)

# Remove duplicates
df_csv.drop_duplicates(inplace=True)
df_excel.drop_duplicates(inplace=True)
df_json.drop_duplicates(inplace=True)



# Standardize column names across datasets if they differ
df_csv.columns = df_csv.columns.str.lower()
df_excel.columns = df_excel.columns.str.lower()
df_json.columns = df_json.columns.str.lower()



print(df_csv.dtypes)
print(df_excel.dtypes)
print(df_json.dtypes)


'''
# Combine datasets into a single DataFrame
combined_df = pd.concat([df_csv, df_excel, df_json], ignore_index=True)
print("Combined Data Head:\n", combined_df.head())



# Example: Create new column for Total Sales
combined_df['total_sales'] = combined_df['quantity'] * combined_df['price']

# Example: Extract year and month from a 'date' column
combined_df['year'] = pd.DatetimeIndex(combined_df['date']).year
combined_df['month'] = pd.DatetimeIndex(combined_df['date']).month



# Calculate total and average sales
total_sales = combined_df['total_sales'].sum()
average_order_value = combined_df['total_sales'].mean()

# Aggregate sales by product category
sales_by_category = combined_df.groupby('category')['total_sales'].sum()

# Display analysis results
print("Total Sales:", total_sales)
print("Average Order Value:", average_order_value)
print("Sales by Category:\n", sales_by_category)



# Set plot style
sns.set(style="whitegrid")

# Bar plot for sales by category
plt.figure(figsize=(10, 6))
sns.barplot(x=sales_by_category.index, y=sales_by_category.values)
plt.title("Total Sales by Product Category")
plt.xlabel("Category")
plt.ylabel("Total Sales")
plt.xticks(rotation=45)
plt.show()

# Pie chart for category distribution
plt.figure(figsize=(8, 8))
combined_df['category'].value_counts().plot.pie(autopct='%1.1f%%')
plt.title("Product Category Distribution")
plt.ylabel("")
plt.show()

# Box plot to analyze order values
plt.figure(figsize=(10, 6))
sns.boxplot(x="category", y="total_sales", data=combined_df)
plt.title("Order Value Distribution by Category")
plt.xlabel("Category")
plt.ylabel("Order Value")
plt.xticks(rotation=45)
plt.show()
'''
CSV Data Head:
    ORDERNUMBER  QUANTITYORDERED  PRICEEACH  ORDERLINENUMBER    SALES  \
0        10107               30      95.70                2  2871.00   
1        10121               34      81.35                5  2765.90   
2        10134               41      94.74                2  3884.34   
3        10145               45      83.26                6  3746.70   
4        10159               49     100.00               14  5205.27   

         ORDERDATE   STATUS  QTR_ID  MONTH_ID  YEAR_ID  ...  \
0   2/24/2003 0:00  Shipped       1         2     2003  ...   
1    5/7/2003 0:00  Shipped       2         5     2003  ...   
2    7/1/2003 0:00  Shipped       3         7     2003  ...   
3   8/25/2003 0:00  Shipped       3         8     2003  ...   
4  10/10/2003 0:00  Shipped       4        10     2003  ...   

                    ADDRESSLINE1  ADDRESSLINE2           CITY STATE  \
0        897 Long Airport Avenue           NaN            NYC    NY   
1             59 rue de l'Abbaye           NaN          Reims   NaN   
2  27 rue du Colonel Pierre Avia           NaN          Paris   NaN   
3             78934 Hillside Dr.           NaN       Pasadena    CA   
4                7734 Strong St.           NaN  San Francisco    CA   

  POSTALCODE COUNTRY TERRITORY CONTACTLASTNAME CONTACTFIRSTNAME DEALSIZE  
0      10022     USA       NaN              Yu             Kwai    Small  
1      51100  France      EMEA         Henriot             Paul    Small  
2      75508  France      EMEA        Da Cunha           Daniel   Medium  
3      90003     USA       NaN           Young            Julie   Medium  
4        NaN     USA       NaN           Brown            Julie   Medium  

[5 rows x 25 columns]
Excel Data Head:
    Postcode  Sales_Rep_ID Sales_Rep_Name  Year         Value
0      2121           456           Jane  2011  84219.497311
1      2092           789         Ashish  2012  28322.192268
2      2128           456           Jane  2013  81878.997241
3      2073           123           John  2011  44491.142121
4      2134           789         Ashish  2012  71837.720959
JSON Data Head:
    id                   email    first     last  \
0   1  isidro_von@hotmail.com   Torrey     Veum   
1   2  frederique19@gmail.com    Micah  Sanford   
2   3       fredy54@gmail.com   Hollis    Swift   
3   4   braxton29@hotmail.com    Perry  Leffler   
4   5      turner59@gmail.com  Janelle  Hagenes   

                          company                       created_at  \
0          Hilll, Mayert and Wolf 2014-12-25 04:06:27.981000+00:00   
1                  Stokes-Reichel 2014-07-03 16:08:17.044000+00:00   
2  Rodriguez, Cartwright and Kuhn 2014-08-18 06:15:16.731000+00:00   
3        Sipes, Feeney and Hansen 2014-07-10 11:31:40.235000+00:00   
4             Lesch and Daughters 2014-04-21 15:05:43.229000+00:00   

                                 country  
0                            Switzerland  
1  Democratic People's Republic of Korea  
2                                Tunisia  
3                                   Chad  
4                              Swaziland  

CSV Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ORDERNUMBER       2823 non-null   int64  
 1   QUANTITYORDERED   2823 non-null   int64  
 2   PRICEEACH         2823 non-null   float64
 3   ORDERLINENUMBER   2823 non-null   int64  
 4   SALES             2823 non-null   float64
 5   ORDERDATE         2823 non-null   object 
 6   STATUS            2823 non-null   object 
 7   QTR_ID            2823 non-null   int64  
 8   MONTH_ID          2823 non-null   int64  
 9   YEAR_ID           2823 non-null   int64  
 10  PRODUCTLINE       2823 non-null   object 
 11  MSRP              2823 non-null   int64  
 12  PRODUCTCODE       2823 non-null   object 
 13  CUSTOMERNAME      2823 non-null   object 
 14  PHONE             2823 non-null   object 
 15  ADDRESSLINE1      2823 non-null   object 
 16  ADDRESSLINE2      302 non-null    object 
 17  CITY              2823 non-null   object 
 18  STATE             1337 non-null   object 
 19  POSTALCODE        2747 non-null   object 
 20  COUNTRY           2823 non-null   object 
 21  TERRITORY         1749 non-null   object 
 22  CONTACTLASTNAME   2823 non-null   object 
 23  CONTACTFIRSTNAME  2823 non-null   object 
 24  DEALSIZE          2823 non-null   object 
dtypes: float64(2), int64(7), object(16)
memory usage: 551.5+ KB
None

Excel Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390 entries, 0 to 389
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Postcode        390 non-null    int64  
 1   Sales_Rep_ID    390 non-null    int64  
 2   Sales_Rep_Name  390 non-null    object 
 3   Year            390 non-null    int64  
 4   Value           390 non-null    float64
dtypes: float64(1), int64(3), object(1)
memory usage: 15.4+ KB
None

JSON Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   id          9999 non-null   int64              
 1   email       9999 non-null   object             
 2   first       9999 non-null   object             
 3   last        9999 non-null   object             
 4   company     9999 non-null   object             
 5   created_at  9999 non-null   datetime64[ns, UTC]
 6   country     9999 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(1), object(5)
memory usage: 546.9+ KB
None
CSV Missing Values:
 ORDERNUMBER            0
QUANTITYORDERED        0
PRICEEACH              0
ORDERLINENUMBER        0
SALES                  0
ORDERDATE              0
STATUS                 0
QTR_ID                 0
MONTH_ID               0
YEAR_ID                0
PRODUCTLINE            0
MSRP                   0
PRODUCTCODE            0
CUSTOMERNAME           0
PHONE                  0
ADDRESSLINE1           0
ADDRESSLINE2        2521
CITY                   0
STATE               1486
POSTALCODE            76
COUNTRY                0
TERRITORY           1074
CONTACTLASTNAME        0
CONTACTFIRSTNAME       0
DEALSIZE               0
dtype: int64
Excel Missing Values:
 Postcode          0
Sales_Rep_ID      0
Sales_Rep_Name    0
Year              0
Value             0
dtype: int64
JSON Missing Values:
 id            0
email         0
first         0
last          0
company       0
created_at    0
country       0
dtype: int64
ORDERNUMBER           int64
QUANTITYORDERED       int64
PRICEEACH           float64
ORDERLINENUMBER       int64
SALES               float64
ORDERDATE            object
STATUS               object
QTR_ID                int64
MONTH_ID              int64
YEAR_ID               int64
PRODUCTLINE          object
MSRP                  int64
PRODUCTCODE          object
CUSTOMERNAME         object
PHONE                object
ADDRESSLINE1         object
ADDRESSLINE2         object
CITY                 object
STATE                object
POSTALCODE           object
COUNTRY              object
TERRITORY            object
CONTACTLASTNAME      object
CONTACTFIRSTNAME     object
DEALSIZE             object
dtype: object
Postcode            int64
Sales_Rep_ID        int64
Sales_Rep_Name     object
Year                int64
Value             float64
dtype: object
id                          int64
email                      object
first                      object
last                       object
company                    object
created_at    datetime64[ns, UTC]
country                    object
dtype: object
Out[3]:
'\n# Combine datasets into a single DataFrame\ncombined_df = pd.concat([df_csv, df_excel, df_json], ignore_index=True)\nprint("Combined Data Head:\n", combined_df.head())\n\n\n\n# Example: Create new column for Total Sales\ncombined_df[\'total_sales\'] = combined_df[\'quantity\'] * combined_df[\'price\']\n\n# Example: Extract year and month from a \'date\' column\ncombined_df[\'year\'] = pd.DatetimeIndex(combined_df[\'date\']).year\ncombined_df[\'month\'] = pd.DatetimeIndex(combined_df[\'date\']).month\n\n\n\n# Calculate total and average sales\ntotal_sales = combined_df[\'total_sales\'].sum()\naverage_order_value = combined_df[\'total_sales\'].mean()\n\n# Aggregate sales by product category\nsales_by_category = combined_df.groupby(\'category\')[\'total_sales\'].sum()\n\n# Display analysis results\nprint("Total Sales:", total_sales)\nprint("Average Order Value:", average_order_value)\nprint("Sales by Category:\n", sales_by_category)\n\n\n\n# Set plot style\nsns.set(style="whitegrid")\n\n# Bar plot for sales by category\nplt.figure(figsize=(10, 6))\nsns.barplot(x=sales_by_category.index, y=sales_by_category.values)\nplt.title("Total Sales by Product Category")\nplt.xlabel("Category")\nplt.ylabel("Total Sales")\nplt.xticks(rotation=45)\nplt.show()\n\n# Pie chart for category distribution\nplt.figure(figsize=(8, 8))\ncombined_df[\'category\'].value_counts().plot.pie(autopct=\'%1.1f%%\')\nplt.title("Product Category Distribution")\nplt.ylabel("")\nplt.show()\n\n# Box plot to analyze order values\nplt.figure(figsize=(10, 6))\nsns.boxplot(x="category", y="total_sales", data=combined_df)\nplt.title("Order Value Distribution by Category")\nplt.xlabel("Category")\nplt.ylabel("Order Value")\nplt.xticks(rotation=45)\nplt.show()\n'
In [ ]:
 
In [24]:
!pip install openpyxl
Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
In [ ]:
 
In [ ]:
#OR 
In [4]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import json

csv = pd.read_csv("sales_data_sample.csv", encoding="cp1252")

ed = pd.read_excel("Sample-Sales-Data.xlsx")


with open("customers.json", "r") as json_file:
    json_data = json.load(json_file)

csv.tail()


csv.info()


csv.describe()


csv.dropna()


csv.drop_duplicates()



ed.head()


ed.tail()



ed.info()



ed.describe()



unified_data = pd.concat([csv, ed], ignore_index=True)



total_sales = unified_data['SALES'].sum()
print("Total Sales:", total_sales)



category_sales = unified_data.groupby('ORDERNUMBER')['SALES'].mean()



category_counts = unified_data['SALES'].value_counts()
category_counts.plot(kind='bar')
plt.title('Product Category Distribution')
plt.xlabel('Category')
plt.ylabel('Count')
plt.show()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ORDERNUMBER       2823 non-null   int64  
 1   QUANTITYORDERED   2823 non-null   int64  
 2   PRICEEACH         2823 non-null   float64
 3   ORDERLINENUMBER   2823 non-null   int64  
 4   SALES             2823 non-null   float64
 5   ORDERDATE         2823 non-null   object 
 6   STATUS            2823 non-null   object 
 7   QTR_ID            2823 non-null   int64  
 8   MONTH_ID          2823 non-null   int64  
 9   YEAR_ID           2823 non-null   int64  
 10  PRODUCTLINE       2823 non-null   object 
 11  MSRP              2823 non-null   int64  
 12  PRODUCTCODE       2823 non-null   object 
 13  CUSTOMERNAME      2823 non-null   object 
 14  PHONE             2823 non-null   object 
 15  ADDRESSLINE1      2823 non-null   object 
 16  ADDRESSLINE2      302 non-null    object 
 17  CITY              2823 non-null   object 
 18  STATE             1337 non-null   object 
 19  POSTALCODE        2747 non-null   object 
 20  COUNTRY           2823 non-null   object 
 21  TERRITORY         1749 non-null   object 
 22  CONTACTLASTNAME   2823 non-null   object 
 23  CONTACTFIRSTNAME  2823 non-null   object 
 24  DEALSIZE          2823 non-null   object 
dtypes: float64(2), int64(7), object(16)
memory usage: 551.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390 entries, 0 to 389
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Postcode        390 non-null    int64  
 1   Sales_Rep_ID    390 non-null    int64  
 2   Sales_Rep_Name  390 non-null    object 
 3   Year            390 non-null    int64  
 4   Value           390 non-null    float64
dtypes: float64(1), int64(3), object(1)
memory usage: 15.4+ KB
Total Sales: 10032628.85
No description has been provided for this image
In [ ]:
 
In [ ]:
#OR
In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df1=pd.read_csv('sales_data_sample.csv', encoding='ISO-8859-1')
df2=pd.read_json('sales.json')
df3=pd.read_excel('sales_data.xlsx')


df1.head()

df2.head()

df3.head()

df1.shape

df2.shape

df3.shape



df1.info()

#Checks for NA values in columns
df1.isna().sum()

#for calculating some statistical data like percentile, mean and std of the numeric
df1.describe()


#Dropping unnecessary columns
df1 = df1.drop(['ADDRESSLINE1','ADDRESSLINE2','CITY','STATE','TERRITORY'],axis = 1)

#Filling all NA values with mode of the POSTALCODE column
df1 = df1['POSTALCODE'].fillna(df1.POSTALCODE.mode(), inplace=True)

df2.info()

df2.isna().sum()

df2 = df2.drop(['ADDRESSLINE1','ADDRESSLINE2','CITY','STATE','TERRITORY'],axis = 1)

df2.describe()

df3.info()



df2.isna().sum()

df3.describe()

import pandas as pd
# Load the CSV file into a DataFrame
df1 = pd.read_csv('sales_data_sample.csv',encoding='ISO-8859-1')
df2 = pd.read_json('sales.json')
df3 = pd.read_excel('sales_data.xlsx')
# Check the data types of columns
data_types = df1.dtypes
print(data_types)


import seaborn as sns
import matplotlib.pyplot as plt
# Assuming your DataFrame is named df1
sns.histplot(x='STATUS', data=df1)
plt.show()


#Plotting histogram plot for STATUS column

import seaborn as sns
import matplotlib.pyplot as plt
sns.histplot(x='STATUS', data=df2, )
plt.show()


#Plotting histogram plot for STATUS column
import seaborn as sns
import matplotlib.pyplot as plt
sns.histplot(x='STATUS', data=df3, )
plt.show()


#Plotting histogram plot for MONTH_ID column
import seaborn as sns
import matplotlib.pyplot as plt
sns.histplot(x='MONTH_ID', data=df1, )
plt.show()


#Plotting histogram plot for MONTH_ID column
import seaborn as sns
import matplotlib.pyplot as plt
sns.histplot(x='MONTH_ID', data=df2, )
plt.show()



#Plotting histogram plot for MONTH_ID column
import seaborn as sns
import matplotlib.pyplot as plt

sns.histplot(x='MONTH_ID', data=df3, )
plt.show()



#Plotting boxplot for STATUS column against MONTH_ID column
sns.boxplot( x="STATUS", y= "MONTH_ID", data=df1, )
plt.show()


#Plotting boxplot for STATUS column against MONTH_ID column
sns.boxplot( x="STATUS", y= "MONTH_ID", data=df2, )
plt.show()


#Plotting boxplot for STATUS column against MONTH_ID column
sns.boxplot( x="STATUS", y= "MONTH_ID", data=df3, )
plt.show()


#Plotting Scatterplot
sns.scatterplot( x="STATUS", y="MONTH_ID", data=df1,
hue='COUNTRY', size='YEAR_ID')
# Placing Legend outside the Figure

plt.legend(bbox_to_anchor=(1, 1), loc=2)
plt.show()

#Plotting Scatterplot
sns.scatterplot( x="STATUS", y="MONTH_ID", data=df2,
hue='COUNTRY', size='YEAR_ID')
# Placing Legend outside the Figure
plt.legend(bbox_to_anchor=(1, 1), loc=2)
plt.show()


#Plotting Scatterplot
sns.scatterplot( x="STATUS", y="MONTH_ID", data=df3,
hue='COUNTRY', size='YEAR_ID')
# Placing Legend outside the Figure
plt.legend(bbox_to_anchor=(1, 1), loc=2)
plt.show()


#Checking the data only for shipped STATUS
data1=df1[df1["STATUS"]=='Shipped']
data1.head()


#Checking the data only for shipped STATUS
data2=df2[df2["STATUS"]=='Shipped']
data2.head()


#Checking the data only for shipped STATUS
data3=df3[df3["STATUS"]=='Shipped']
data3.head()

data1.shape



data2.shape



data3.shape



#Calculating sum for sales column
sum_sales = df1['SALES'].sum()
print("Addition of all sales",sum_sales)


#Calculating sum for sales column
sum_sales = df2['SALES'].sum()
print("Addition of all sales",sum_sales)

#Calculating sum for sales column
sum_sales = df3['SALES'].sum()
print("Addition of all sales",sum_sales)


#Calulating average for sales column
sales_avg = df1['SALES'].mean()
print("Average of total sales = ",sales_avg)


#Calulating average for sales column
sales_avg = df2['SALES'].mean()
print("Average of total sales = ",sales_avg)


#Calulating average for sales column
sales_avg = df3['SALES'].mean()
print("Average of total sales = ",sales_avg)


import sklearn
import pandas as pd
import seaborn as sns
# IQR
Q1 = np.percentile(df1['SALES'], 25,

interpolation = 'midpoint')

Q3 = np.percentile(df1['SALES'], 75,

interpolation = 'midpoint')

IQR = Q3 - Q1
print("Old Shape: ", df1.shape)
# Upper bound
upper = np.where(df1['SALES'] >= (Q3+1.5*IQR))
# Lower bound
lower = np.where(df1['SALES'] <= (Q1-1.5*IQR))
# Removing the Outliers
df1.drop(upper[0], inplace = True)
df1.drop(lower[0], inplace = True)
print("New Shape: ", df1.shape)
sns.boxplot(x='SALES', data=df1)

import sklearn
import pandas as pd
import seaborn as sns
# IQR
Q1 = np.percentile(df2['SALES'], 25,

interpolation = 'midpoint')

Q3 = np.percentile(df2['SALES'], 75,

interpolation = 'midpoint')

IQR = Q3 - Q1
print("Old Shape: ", df2.shape)
# Upper bound
upper = np.where(df2['SALES'] >= (Q3+1.5*IQR))
# Lower bound
lower = np.where(df2['SALES'] <= (Q1-1.5*IQR))
# Removing the Outliers
df2.drop(upper[0], inplace = True)



import sklearn
import pandas as pd
import seaborn as sns
# IQR
Q1 = np.percentile(df3['SALES'], 25,

interpolation = 'midpoint')

Q3 = np.percentile(df3['SALES'], 75,

interpolation = 'midpoint')

IQR = Q3 - Q1
print("Old Shape: ", df3.shape)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ORDERNUMBER       2823 non-null   int64  
 1   QUANTITYORDERED   2823 non-null   int64  
 2   PRICEEACH         2823 non-null   float64
 3   ORDERLINENUMBER   2823 non-null   int64  
 4   SALES             2823 non-null   float64
 5   ORDERDATE         2823 non-null   object 
 6   STATUS            2823 non-null   object 
 7   QTR_ID            2823 non-null   int64  
 8   MONTH_ID          2823 non-null   int64  
 9   YEAR_ID           2823 non-null   int64  
 10  PRODUCTLINE       2823 non-null   object 
 11  MSRP              2823 non-null   int64  
 12  PRODUCTCODE       2823 non-null   object 
 13  CUSTOMERNAME      2823 non-null   object 
 14  PHONE             2823 non-null   object 
 15  ADDRESSLINE1      2823 non-null   object 
 16  ADDRESSLINE2      302 non-null    object 
 17  CITY              2823 non-null   object 
 18  STATE             1337 non-null   object 
 19  POSTALCODE        2747 non-null   object 
 20  COUNTRY           2823 non-null   object 
 21  TERRITORY         1749 non-null   object 
 22  CONTACTLASTNAME   2823 non-null   object 
 23  CONTACTFIRSTNAME  2823 non-null   object 
 24  DEALSIZE          2823 non-null   object 
dtypes: float64(2), int64(7), object(16)
memory usage: 551.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ORDERNUMBER       2823 non-null   int64  
 1   QUANTITYORDERED   2823 non-null   int64  
 2   PRICEEACH         2823 non-null   float64
 3   ORDERLINENUMBER   2823 non-null   int64  
 4   SALES             2823 non-null   float64
 5   ORDERDATE         2823 non-null   object 
 6   STATUS            2823 non-null   object 
 7   QTR_ID            2823 non-null   int64  
 8   MONTH_ID          2823 non-null   int64  
 9   YEAR_ID           2823 non-null   int64  
 10  PRODUCTLINE       2823 non-null   object 
 11  MSRP              2823 non-null   int64  
 12  PRODUCTCODE       2823 non-null   object 
 13  CUSTOMERNAME      2823 non-null   object 
 14  PHONE             2823 non-null   object 
 15  ADDRESSLINE1      2823 non-null   object 
 16  ADDRESSLINE2      2823 non-null   object 
 17  CITY              2823 non-null   object 
 18  STATE             2823 non-null   object 
 19  POSTALCODE        2823 non-null   object 
 20  COUNTRY           2823 non-null   object 
 21  TERRITORY         2823 non-null   object 
 22  CONTACTLASTNAME   2823 non-null   object 
 23  CONTACTFIRSTNAME  2823 non-null   object 
 24  DEALSIZE          2823 non-null   object 
dtypes: float64(2), int64(7), object(16)
memory usage: 551.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ORDERNUMBER       2823 non-null   int64  
 1   QUANTITYORDERED   2823 non-null   int64  
 2   PRICEEACH         2823 non-null   float64
 3   ORDERLINENUMBER   2823 non-null   int64  
 4   SALES             2823 non-null   float64
 5   ORDERDATE         2823 non-null   object 
 6   STATUS            2823 non-null   object 
 7   QTR_ID            2823 non-null   int64  
 8   MONTH_ID          2823 non-null   int64  
 9   YEAR_ID           2823 non-null   int64  
 10  PRODUCTLINE       2823 non-null   object 
 11  MSRP              2823 non-null   int64  
 12  PRODUCTCODE       2823 non-null   object 
 13  CUSTOMERNAME      2823 non-null   object 
 14  PHONE             2823 non-null   object 
 15  ADDRESSLINE1      2823 non-null   object 
 16  ADDRESSLINE2      302 non-null    object 
 17  CITY              2823 non-null   object 
 18  STATE             1337 non-null   object 
 19  POSTALCODE        2747 non-null   object 
 20  COUNTRY           2823 non-null   object 
 21  TERRITORY         1749 non-null   object 
 22  CONTACTLASTNAME   2823 non-null   object 
 23  CONTACTFIRSTNAME  2823 non-null   object 
 24  DEALSIZE          2823 non-null   object 
dtypes: float64(2), int64(7), object(16)
memory usage: 551.5+ KB
ORDERNUMBER           int64
QUANTITYORDERED       int64
PRICEEACH           float64
ORDERLINENUMBER       int64
SALES               float64
ORDERDATE            object
STATUS               object
QTR_ID                int64
MONTH_ID              int64
YEAR_ID               int64
PRODUCTLINE          object
MSRP                  int64
PRODUCTCODE          object
CUSTOMERNAME         object
PHONE                object
ADDRESSLINE1         object
ADDRESSLINE2         object
CITY                 object
STATE                object
POSTALCODE           object
COUNTRY              object
TERRITORY            object
CONTACTLASTNAME      object
CONTACTFIRSTNAME     object
DEALSIZE             object
dtype: object
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Addition of all sales 10032628.85
Addition of all sales 10032628.85
Addition of all sales 10032628.85
Average of total sales =  3553.889071909316
Average of total sales =  3553.889071909316
Average of total sales =  3553.889071909316
Old Shape:  (2823, 25)
New Shape:  (2742, 25)
Old Shape:  (2823, 25)
Old Shape:  (2823, 25)
No description has been provided for this image
In [ ]:
 
In [ ]:
#OR 
In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re  # Importing the re module for regular expressions

# Load datasets
dcsv = pd.read_csv('customers.csv')
djson = pd.read_json('customers.json')
dxlsx = pd.read_excel('customers.xlsx')

# Display first few rows of each dataset
print(dcsv.head())
print(djson.head())
print(dxlsx.head())

# Check for missing values in each dataset
print(f"Missing values in csv\n{dcsv.isna().sum()}")
print(f"\nMissing values in json\n{djson.isna().sum()}")
print(f"\nMissing values in xlsx\n{dxlsx.isna().sum()}")

# Get information about each dataset
print(f"info of csv")
dcsv.info()
print(f"\ninfo of json")
djson.info()
print(f"\ninfo of xlsx")
dxlsx.info()

# Fill missing values with 0
dcsv.fillna(0, inplace=True)
djson.fillna(0, inplace=True)
dxlsx.fillna(0, inplace=True)

# Check for duplicates
print("Duplicate entries in csv:\n", dcsv[dcsv.duplicated()])
print("Duplicate entries in json:\n", djson[djson.duplicated()])
print("Duplicate entries in xlsx:\n", dxlsx[dxlsx.duplicated()])

# Drop duplicates
dcsv.drop_duplicates(inplace=True)
djson.drop_duplicates(inplace=True)
dxlsx.drop_duplicates(inplace=True)

# Concatenate dataframes
uni_df = pd.concat([dcsv, djson, dxlsx], ignore_index=True)
print(f"Combined dataset shape: {uni_df.shape}")

# Create a full name column
dcsv['full name'] = dcsv['first_name'] + ' ' + dcsv['last_name']
print(dcsv.head())

# Function to extract pin code from address
def extract_pin_code(address):
    match = re.search(r'\b\d{5}\b', address)
    return match.group(0) if match else None  # Return None if no match is found

# Apply the function to create a pin code column
dcsv['pin code'] = [extract_pin_code(add) for add in dcsv['address']]
print(dcsv.head())

# Describe the combined dataframe
print(uni_df.describe())

# Aggregate data by job category
job_summary = uni_df.groupby('job').agg({'orders': 'sum', 'spent': 'mean'})
print(job_summary)

# Calculate total sales
total_sales = uni_df['spent'].sum()
print(f"Total Sales: {total_sales}")

# Calculate average spending
average_spent = uni_df['spent'].mean()
print(f"Average Spending: {average_spent}")

# Count of each job category
job_counts = uni_df['job'].value_counts()
print(job_counts)

# Visualizations
# Bar plot of sales by job category
plt.figure(figsize=(12, 6))
sns.barplot(x='job', y='spent', data=uni_df)
plt.title('Sales by Job Category')
plt.xlabel('Job Category')
plt.ylabel('Total Sales')
plt.xticks(rotation=90)
plt.show()

# Pie chart for job category distribution
plt.figure(figsize=(10, 6))
product_distribution = uni_df['job'].value_counts()
plt.pie(product_distribution, labels=product_distribution.index, autopct='%1.2f%%', startangle=140)
plt.title('Job Category Distribution')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()

# Box plot for order value distribution by job
plt.figure(figsize=(12, 6))
sns.boxplot(x='job', y='spent', data=uni_df)
plt.title('Order Value Distribution by Job')
plt.xlabel('Job')
plt.ylabel('Order Value')
plt.xticks(rotation=90)
plt.show()
  first_name last_name                               email  \
0     Joseph      Rice      josephrice131@slingacademy.com   
1       Gary     Moore       garymoore386@slingacademy.com   
2       John    Walker      johnwalker944@slingacademy.com   
3       Eric    Carter      ericcarter176@slingacademy.com   
4    William   Jackson  williamjackson427@slingacademy.com   

                  phone                                   address gender  age  \
0  +1-800-040-3135x6208    91773 Miller Shoal\nDiaztown, FL 38841   male   43   
1     221.945.4191x8872       6450 John Lodge\nTerriton, KY 95945   male   71   
2     388-142-4883x5370  27265 Murray Island\nKevinfort, PA 63231   male   44   
3         (451)259-5402                 USNS Knight\nFPO AA 76532   male   38   
4      625.626.9133x374   170 Jackson Loaf\nKristenland, AS 48876   male   58   

   registered  orders   spent       job                      hobbies  \
0  2019-05-05       7  568.29    Artist               Playing sports   
1  2020-05-20      11  568.92    Artist                     Swimming   
2  2020-04-04      11  497.12     Clerk                     Painting   
3  2019-01-30      17  834.60   Butcher  Playing musical instruments   
4  2022-07-01      14  151.59  Engineer                      Reading   

   is_married  
0       False  
1        True  
2       False  
3       False  
4       False  
   id                   email    first     last  \
0   1  isidro_von@hotmail.com   Torrey     Veum   
1   2  frederique19@gmail.com    Micah  Sanford   
2   3       fredy54@gmail.com   Hollis    Swift   
3   4   braxton29@hotmail.com    Perry  Leffler   
4   5      turner59@gmail.com  Janelle  Hagenes   

                          company                       created_at  \
0          Hilll, Mayert and Wolf 2014-12-25 04:06:27.981000+00:00   
1                  Stokes-Reichel 2014-07-03 16:08:17.044000+00:00   
2  Rodriguez, Cartwright and Kuhn 2014-08-18 06:15:16.731000+00:00   
3        Sipes, Feeney and Hansen 2014-07-10 11:31:40.235000+00:00   
4             Lesch and Daughters 2014-04-21 15:05:43.229000+00:00   

                                 country  
0                            Switzerland  
1  Democratic People's Republic of Korea  
2                                Tunisia  
3                                   Chad  
4                              Swaziland  
  first_name last_name                               email  \
0     Joseph      Rice      josephrice131@slingacademy.com   
1       Gary     Moore       garymoore386@slingacademy.com   
2       John    Walker      johnwalker944@slingacademy.com   
3       Eric    Carter      ericcarter176@slingacademy.com   
4    William   Jackson  williamjackson427@slingacademy.com   

                  phone                                   address gender  age  \
0  +1-800-040-3135x6208    91773 Miller Shoal\nDiaztown, FL 38841   male   43   
1     221.945.4191x8872       6450 John Lodge\nTerriton, KY 95945   male   71   
2     388-142-4883x5370  27265 Murray Island\nKevinfort, PA 63231   male   44   
3         (451)259-5402                 USNS Knight\nFPO AA 76532   male   38   
4      625.626.9133x374   170 Jackson Loaf\nKristenland, AS 48876   male   58   

  registered  orders   spent       job                      hobbies  \
0 2019-05-05       7  568.29    Artist               Playing sports   
1 2020-05-20      11  568.92    Artist                     Swimming   
2 2020-04-04      11  497.12     Clerk                     Painting   
3 2019-01-30      17  834.60   Butcher  Playing musical instruments   
4 2022-07-01      14  151.59  Engineer                      Reading   

   is_married  
0       False  
1        True  
2       False  
3       False  
4       False  
Missing values in csv
first_name    0
last_name     0
email         0
phone         0
address       0
gender        0
age           0
registered    0
orders        0
spent         0
job           0
hobbies       0
is_married    0
dtype: int64

Missing values in json
id            0
email         0
first         0
last          0
company       0
created_at    0
country       0
dtype: int64

Missing values in xlsx
first_name    0
last_name     0
email         0
phone         0
address       0
gender        0
age           0
registered    0
orders        0
spent         0
job           0
hobbies       0
is_married    0
dtype: int64
info of csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   first_name  1000 non-null   object 
 1   last_name   1000 non-null   object 
 2   email       1000 non-null   object 
 3   phone       1000 non-null   object 
 4   address     1000 non-null   object 
 5   gender      1000 non-null   object 
 6   age         1000 non-null   int64  
 7   registered  1000 non-null   object 
 8   orders      1000 non-null   int64  
 9   spent       1000 non-null   float64
 10  job         1000 non-null   object 
 11  hobbies     1000 non-null   object 
 12  is_married  1000 non-null   bool   
dtypes: bool(1), float64(1), int64(2), object(9)
memory usage: 94.9+ KB

info of json
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   id          9999 non-null   int64              
 1   email       9999 non-null   object             
 2   first       9999 non-null   object             
 3   last        9999 non-null   object             
 4   company     9999 non-null   object             
 5   created_at  9999 non-null   datetime64[ns, UTC]
 6   country     9999 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(1), object(5)
memory usage: 546.9+ KB

info of xlsx
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   first_name  1000 non-null   object        
 1   last_name   1000 non-null   object        
 2   email       1000 non-null   object        
 3   phone       1000 non-null   object        
 4   address     1000 non-null   object        
 5   gender      1000 non-null   object        
 6   age         1000 non-null   int64         
 7   registered  1000 non-null   datetime64[ns]
 8   orders      1000 non-null   int64         
 9   spent       1000 non-null   float64       
 10  job         1000 non-null   object        
 11  hobbies     1000 non-null   object        
 12  is_married  1000 non-null   bool          
dtypes: bool(1), datetime64[ns](1), float64(1), int64(2), object(8)
memory usage: 94.9+ KB
Duplicate entries in csv:
 Empty DataFrame
Columns: [first_name, last_name, email, phone, address, gender, age, registered, orders, spent, job, hobbies, is_married]
Index: []
Duplicate entries in json:
 Empty DataFrame
Columns: [id, email, first, last, company, created_at, country]
Index: []
Duplicate entries in xlsx:
 Empty DataFrame
Columns: [first_name, last_name, email, phone, address, gender, age, registered, orders, spent, job, hobbies, is_married]
Index: []
Combined dataset shape: (11999, 19)
  first_name last_name                               email  \
0     Joseph      Rice      josephrice131@slingacademy.com   
1       Gary     Moore       garymoore386@slingacademy.com   
2       John    Walker      johnwalker944@slingacademy.com   
3       Eric    Carter      ericcarter176@slingacademy.com   
4    William   Jackson  williamjackson427@slingacademy.com   

                  phone                                   address gender  age  \
0  +1-800-040-3135x6208    91773 Miller Shoal\nDiaztown, FL 38841   male   43   
1     221.945.4191x8872       6450 John Lodge\nTerriton, KY 95945   male   71   
2     388-142-4883x5370  27265 Murray Island\nKevinfort, PA 63231   male   44   
3         (451)259-5402                 USNS Knight\nFPO AA 76532   male   38   
4      625.626.9133x374   170 Jackson Loaf\nKristenland, AS 48876   male   58   

   registered  orders   spent       job                      hobbies  \
0  2019-05-05       7  568.29    Artist               Playing sports   
1  2020-05-20      11  568.92    Artist                     Swimming   
2  2020-04-04      11  497.12     Clerk                     Painting   
3  2019-01-30      17  834.60   Butcher  Playing musical instruments   
4  2022-07-01      14  151.59  Engineer                      Reading   

   is_married        full name  
0       False      Joseph Rice  
1        True       Gary Moore  
2       False      John Walker  
3       False      Eric Carter  
4       False  William Jackson  
  first_name last_name                               email  \
0     Joseph      Rice      josephrice131@slingacademy.com   
1       Gary     Moore       garymoore386@slingacademy.com   
2       John    Walker      johnwalker944@slingacademy.com   
3       Eric    Carter      ericcarter176@slingacademy.com   
4    William   Jackson  williamjackson427@slingacademy.com   

                  phone                                   address gender  age  \
0  +1-800-040-3135x6208    91773 Miller Shoal\nDiaztown, FL 38841   male   43   
1     221.945.4191x8872       6450 John Lodge\nTerriton, KY 95945   male   71   
2     388-142-4883x5370  27265 Murray Island\nKevinfort, PA 63231   male   44   
3         (451)259-5402                 USNS Knight\nFPO AA 76532   male   38   
4      625.626.9133x374   170 Jackson Loaf\nKristenland, AS 48876   male   58   

   registered  orders   spent       job                      hobbies  \
0  2019-05-05       7  568.29    Artist               Playing sports   
1  2020-05-20      11  568.92    Artist                     Swimming   
2  2020-04-04      11  497.12     Clerk                     Painting   
3  2019-01-30      17  834.60   Butcher  Playing musical instruments   
4  2022-07-01      14  151.59  Engineer                      Reading   

   is_married        full name pin code  
0       False      Joseph Rice    91773  
1        True       Gary Moore    95945  
2       False      John Walker    27265  
3       False      Eric Carter    76532  
4       False  William Jackson    48876  
               age      orders        spent           id
count  2000.000000  2000.00000  2000.000000  9999.000000
mean     48.145000    10.03600   496.295100  5000.000000
std      18.347869     6.17412   289.156882  2886.607005
min      18.000000     0.00000     0.490000     1.000000
25%      32.000000     5.00000   248.100000  2500.500000
50%      47.000000    10.00000   469.045000  5000.000000
75%      64.000000    15.00000   739.660000  7499.500000
max      80.000000    20.00000   999.150000  9999.000000
                orders       spent
job                               
Accountant       672.0  480.695882
Architect        436.0  549.016667
Artist           394.0  576.194583
Athlete          604.0  503.059630
Baker            654.0  469.264194
Barber           692.0  453.617143
Butcher          650.0  430.244687
Carpenter        314.0  521.049000
Cashier          554.0  541.296538
Chef             700.0  501.610937
Clerk            508.0  486.912083
Dentist          600.0  498.399667
Doctor           618.0  498.835000
Engineer         606.0  485.854231
Farmer           418.0  417.005217
Firefighter      530.0  421.117692
Hairdresser      552.0  474.699259
Housewife        524.0  470.946154
Janitor          640.0  528.832000
Lawyer           698.0  524.990000
Librarian        272.0  445.107143
Mechanic         590.0  547.160606
Nurse            502.0  430.046296
Officer          388.0  553.467000
Pilot            630.0  495.528750
Polic            492.0  418.494400
Politician       420.0  561.991739
Receptionist     560.0  561.750294
Scientist        432.0  475.320476
Security Guard   510.0  535.684815
Student          558.0  522.103077
Tailor           676.0  463.127059
Teacher          520.0  473.012500
Unemployed       678.0  433.800000
Unkown           400.0  579.054091
Waiter           478.0  572.398400
Waitress         602.0  475.518889
Total Sales: 992590.2
Average Spending: 496.2951
job
Lawyer            74
Barber            70
Tailor            68
Receptionist      68
Accountant        68
Mechanic          66
Chef              64
Butcher           64
Doctor            64
Baker             62
Dentist           60
Janitor           60
Unemployed        56
Hairdresser       54
Nurse             54
Athlete           54
Security Guard    54
Waitress          54
Cashier           52
Student           52
Firefighter       52
Housewife         52
Engineer          52
Polic             50
Waiter            50
Architect         48
Teacher           48
Pilot             48
Clerk             48
Artist            48
Farmer            46
Politician        46
Unkown            44
Scientist         42
Officer           40
Carpenter         40
Librarian         28
Name: count, dtype: int64
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
''' Pract 8 - Interacting with Web APIs
Problem Statement: Analyzing Weather Data from OpenWeatherMap API
Dataset: Weather data retrieved from OpenWeatherMap API
Description: The goal is to interact with the OpenWeatherMap API to retrieve weather data
for a specific location and perform data modeling and visualization to analyze weather
patterns over time.
Tasks to Perform:
1. Register and obtain API key from OpenWeatherMap.
2. Interact with the OpenWeatherMap API using the API key to retrieve weather data for
a specific location.
3. Extract relevant weather attributes such as temperature, humidity, wind speed, and
precipitation from the API response.
4. Clean and preprocess the retrieved data, handling missing values or inconsistent
formats.
5. Perform data modeling to analyze weather patterns, such as calculating average
temperature, maximum/minimum values, or trends over time.
6. Visualize the weather data using appropriate plots, such as line charts, bar plots, or
scatter plots, to represent temperature changes, precipitation levels, or wind speed
variations.
7. Apply data aggregation techniques to summarize weather statistics by specific time
periods (e.g., daily, monthly, seasonal).
8. Incorporate geographical information, if available, to create maps or geospatial
visualizations representing weather patterns across different locations.
9. Explore and visualize relationships between weather attributes, such as temperature
and humidity, using correlation plots or heatmaps.
In [ ]:
 
In [10]:
import requests 
import pandas as pd
import datetime
#default api_key = a4ddc1eec173ae7ce62ad20c1d4c51f3
# api url =  f"http://api.openweathermap.org/data/2.5/weather?q=London,uk&APPID=a4ddc1eec173ae7ce62ad20c1d4c51f3


# Set your OpenWeatherMap API key

#default api key = a16fb0accb4b396705eef57c4fa5f0ca 
# api url demo = f"http://api.openweathermap.org/data/2.5/forecast?lat={lat}&lon={lon}&appid={api_key}"

api_key = 'fb365aa6104829b44455572365ff3b4e' 

# Set the location for which you want to retrieve weather data 
lat = 18.184135
lon = 74.610764

# https://openweathermap.org/api/one-call-3
# how	How to use api call 
# Construct the API URL
api_url = f"http://api.openweathermap.org/data/2.5/forecast?lat={lat}&lon={lon}&appid={api_key}"

# Send a GET request to the API 
response = requests.get(api_url)
weather_data = response.json()
weather_data.keys()
len(weather_data['list'])
weather_data['list'][0]['weather'][0]['description']


#Getting the data from dictionary and taking into one variable 
# Extract relevant weather attributes using list comprehension
temperatures = [item['main']['temp'] for item in weather_data['list']] 

# It will extract all values (40) and putting into one variable
timestamps = [pd.to_datetime(item['dt'], unit='s') for item in weather_data['list']]
temperature = [item['main']['temp'] for item in weather_data['list']]
humidity = [item['main']['humidity'] for item in weather_data['list']]
wind_speed = [item['wind']['speed'] for item in weather_data['list']]
weather_description = [item['weather'][0]['description'] for item in weather_data['list']]


# Create a pandas DataFrame with the extracted weather data
weather_df = pd.DataFrame({'Timestamp': timestamps, 
                           'Temperature': temperatures, 
                           'humidity': humidity, 
                           'wind_speed':wind_speed,
                           'weather_description': weather_description})


# Set the Timestamp column as the DataFrame's index
weather_df.set_index('Timestamp', inplace=True)
max_temp = weather_df['Temperature'].max()
print(f"Maximum Temperature - {max_temp}")
min_temp = weather_df['Temperature'].min()
print(f"Minimum Temperature - {min_temp}")


# Clean and preprocess the data # Handling missing values
weather_df.fillna(0, inplace=True) # Replace missing values with 0 or appropriate value

# Handling inconsistent format (if applicable)
weather_df['Temperature'] = weather_df['Temperature'].apply(lambda x: x - 273.15 if isinstance(x, float)else x)

# Convert temperature from Kelvin to Celsius
# Print the cleaned and preprocessed data print(weather_df)
weather_df.head()

import matplotlib.pyplot as plt
daily_mean_temp = weather_df['Temperature'].resample('D').mean()
daily_mean_humidity = weather_df['humidity'].resample('D').mean()
daily_mean_wind_speed = weather_df['wind_speed'].resample('D').mean()


# Plot the mean daily temperature over time (Line plot)
plt.figure(figsize=(10, 6))
daily_mean_temp.plot(color='red', linestyle='-', marker='o')
plt.title('Mean Daily Temperature')
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')
plt.grid(True)
plt.show()

# Plot the mean daily humidity over time (Bar plot)
plt.figure(figsize=(10, 6))
daily_mean_humidity.plot(kind='bar', color='blue')
plt.title('Mean Daily Humidity')
plt.xlabel('Date')
plt.ylabel('Humidity (%)')
plt.grid(True)
plt.show()


# Plot the relationship between temperature and wind speed (Scatter plot)
plt.figure(figsize=(10, 6))
plt.scatter(weather_df['Temperature'], weather_df['wind_speed'], color='green')
plt.title('Temperature vs. Wind Speed')
plt.xlabel('Temperature (°C)')
plt.ylabel('Wind Speed (m/s)')
plt.grid(True)
plt.show()


# Heatmap
import seaborn as sns
heatmap_data = weather_df[['Temperature', 'humidity']]
sns.heatmap(heatmap_data, annot=True, cmap='coolwarm')
plt.title('Temperature vs Humidity Heatmap')
plt.show()


# Create a scatter plot to visualize the relationship between temperature and humidity
plt.scatter(weather_df['Temperature'], weather_df['humidity'])
plt.xlabel('Temperature (°C)')
plt.ylabel('Humidity (%)')
plt.title('Temperature vs Humidity Scatter Plot')
plt.show()
Maximum Temperature - 305.17
Minimum Temperature - 293.16
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]:
#OR 
In [ ]:
pip install folium
In [7]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import folium
import seaborn as sns

# Step 1: Register and obtain API key from OpenWeatherMap
api_key = "fb365aa6104829b44455572365ff3b4e"

# Step 2: Interact with the OpenWeatherMap API
base_url = "http://api.openweathermap.org/data/2.5/forecast"
city_name = "New York"
params = {
    "q": city_name,
    "appid": api_key,
    "units": "metric"
}
response = requests.get(base_url, params=params)
data = response.json()

response 

data

# Extract relevant weather attributes and timestamps
weather_data = data["list"]
timestamps = [entry["dt"] for entry in weather_data]
temperatures = [entry["main"]["temp"] for entry in weather_data]
humidity = [entry["main"]["humidity"] for entry in weather_data]
wind_speed = [entry["wind"]["speed"] for entry in weather_data]
weather_description = [item['weather'][0]['description'] for item in weather_data]

weather_data 



# Convert timestamps to pandas datetime objects
datetime_objects = [pd.to_datetime(timestamp, unit="s") for timestamp in timestamps]

# Create a pandas DataFrame with the extracted weather data
weather_df = pd.DataFrame({
    'Timestamp': timestamps,
    'Datetime': datetime_objects,
    'Temperature': temperatures,
    'Humidity': humidity,
    'Wind Speed': wind_speed,
    'Weather Description': weather_description,
})

# Convert the "Datetime" column to a DatetimeIndex
weather_df.set_index("Datetime", inplace=True)

weather_df

# Calculate total precipitation (rain + snow) for each entry
precipitation = []
for entry in weather_data:
    rain = entry.get("rain", {}).get("3h", 0)
    snow = entry.get("snow", {}).get("3h", 0)
    total_precipitation = rain + snow
    precipitation.append(total_precipitation)

precipitation

# Step 5: Perform data modeling
avg_temp = sum(temperatures) / len(temperatures)
max_temp = max(temperatures)
min_temp = min(temperatures)

# Plot the mean daily temperature over time (Line plot)
daily_mean_temp = weather_df['Temperature'].resample('D').mean()
daily_mean_humidity = weather_df['Humidity'].resample('D').mean()
daily_mean_wind_speed = weather_df['Wind Speed'].resample('D').mean()

plt.figure(figsize=(10, 6))
daily_mean_temp.plot(color='red', linestyle='-', marker='o')
plt.title('Mean Daily Temperature')
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')
plt.grid(True)
plt.show()

# Plot the mean daily humidity over time (Bar plot)
plt.figure(figsize=(10, 6))
daily_mean_humidity.plot(kind='bar', color='blue')
plt.title('Mean Daily Humidity')
plt.xlabel('Date')
plt.ylabel('Humidity (%)')
plt.grid(True)
plt.show()

# Plot the relationship between temperature and wind speed (Scatter plot)
plt.figure(figsize=(10, 6))
plt.scatter(weather_df['Temperature'], weather_df['Wind Speed'], color='green')
plt.title('Temperature vs. Wind Speed')
plt.xlabel('Temperature (°C)')
plt.ylabel('Wind Speed (m/s)')
plt.grid(True)
plt.show()

# Step 6: Visualize the weather data
plt.figure(figsize=(10, 6))
plt.plot(timestamps, temperatures, label="Temperature (°C)")
plt.plot(timestamps, precipitation, label="Precipitation")
plt.xlabel("Timestamp")
plt.ylabel("Value")
plt.title(f"Weather Data for {city_name}")
plt.legend()
plt.show()

# Create a heatmap of temperature vs humidity
heatmap_data = weather_df[['Temperature', 'Humidity']]
sns.heatmap(heatmap_data, annot=True, cmap='coolwarm')
plt.title('Temperature vs Humidity Heatmap')
plt.show()

# Scatter plot to visualize the relationship between temperature and humidity
plt.scatter(weather_df['Temperature'], weather_df['Humidity'])
plt.xlabel('Temperature (°C)')
plt.ylabel('Humidity (%)')
plt.title('Temperature vs Humidity Scatter Plot')
plt.show()

# Step 7: Retrieve weather data for multiple locations
locations = ['London', 'Paris', 'New York']
multi_location_df = pd.DataFrame()

for location in locations:
    api_url = f'http://api.openweathermap.org/data/2.5/weather?q={location}&appid={api_key}'
    response = requests.get(api_url)
    data = response.json()

    # Extract relevant weather attributes
    temperature = data['main']['temp']
    humidity = data['main']['humidity']
    wind_speed = data['wind']['speed']
    latitude = data.get('coord', {}).get('lat', None)
    longitude = data.get('coord', {}).get('lon', None)

    # Create a DataFrame for the current location's weather data
    location_df = pd.DataFrame({
        'Location': [location],
        'Temperature': [temperature],
        'Humidity': [humidity],
        'Wind Speed': [wind_speed],
        'Latitude': [latitude],
        'Longitude': [longitude]
    })

    # Append the current location's weather data to the main DataFrame
    multi_location_df = pd.concat([multi_location_df, location_df], ignore_index=True)

# Display the DataFrame with weather data for multiple locations
print(multi_location_df)

# Step 8: Geospatial Visualization with Folium
# Center the map around New York (or the first location)
latitude = multi_location_df.iloc[0]['Latitude']
longitude = multi_location_df.iloc[0]['Longitude']
weather_map = folium.Map(location=[latitude, longitude], zoom_start=4)

# Add markers for each location in the DataFrame
for index, row in multi_location_df.iterrows():
    location_name = row['Location']
    latitude = row['Latitude']
    longitude = row['Longitude']
    folium.Marker([latitude, longitude], popup=location_name).add_to(weather_map)

# Save the map as an HTML file
weather_map.save("weather_map.html")

print("Map saved as 'weather_map.html'.")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
   Location  Temperature  Humidity  Wind Speed  Latitude  Longitude
0    London       284.68        83        4.12   51.5085    -0.1257
1     Paris       282.20        90        4.12   48.8534     2.3488
2  New York       285.31        34        4.12   40.7143   -74.0060
Map saved as 'weather_map.html'.
In [ ]:
 
In [ ]:
#OR 
In [22]:
#Own code

import requests
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Set your OpenWeatherMap API key
api_key = 'a4ddc1eec173ae7ce62ad20c1d4c51f3'  # Replace with your actual API key
lat = 18.184135  # Latitude of the location
lon = 74.610764  # Longitude of the location

# API URL for forecast data
api_url = f"http://api.openweathermap.org/data/2.5/forecast?lat={lat}&lon={lon}&appid={api_key}"

# Send a GET request to the API
response = requests.get(api_url)
weather_data = response.json()

# Ensure 'list' exists in the response
if 'list' in weather_data:
    # Extract relevant weather attributes using list comprehension
    temperatures = [item['main']['temp'] - 273.15 for item in weather_data['list']]  # Convert from Kelvin to Celsius
    timestamps = [pd.to_datetime(item['dt'], unit='s') for item in weather_data['list']]
    humidity = [item['main']['humidity'] for item in weather_data['list']]
    wind_speed = [item['wind']['speed'] for item in weather_data['list']]
    weather_description = [item['weather'][0]['description'] for item in weather_data['list']]

    # Create a pandas DataFrame
    weather_df = pd.DataFrame({
        'Timestamp': timestamps, 
        'Temperature': temperatures, 
        'Humidity': humidity, 
        'Wind Speed': wind_speed,
        'Weather Description': weather_description
    })

    # Set the Timestamp column as the DataFrame's index
    weather_df.set_index('Timestamp', inplace=True)

    # Print the cleaned DataFrame
    print(weather_df.head())

    # Calculate max and min temperatures
    max_temp = weather_df['Temperature'].max()
    print(f"Maximum Temperature: {max_temp:.2f}°C")
    min_temp = weather_df['Temperature'].min()
    print(f"Minimum Temperature: {min_temp:.2f}°C")

    # Clean and preprocess the data
    weather_df.fillna(0, inplace=True)  # Replace missing values with 0 or appropriate value

    # Plot the mean daily temperature over time (Line plot)
    daily_mean_temp = weather_df['Temperature'].resample('D').mean()
    plt.figure(figsize=(10, 6))
    daily_mean_temp.plot(color='red', linestyle='-', marker='o')
    plt.title('Mean Daily Temperature')
    plt.xlabel('Date')
    plt.ylabel('Temperature (°C)')
    plt.grid(True)
    plt.show()

    # Plot the mean daily humidity over time (Bar plot)
    daily_mean_humidity = weather_df['Humidity'].resample('D').mean()
    plt.figure(figsize=(10, 6))
    daily_mean_humidity.plot(kind='bar', color='blue')
    plt.title('Mean Daily Humidity')
    plt.xlabel('Date')
    plt.ylabel('Humidity (%)')
    plt.grid(True)
    plt.show()

    # Plot the relationship between temperature and wind speed (Scatter plot)
    plt.figure(figsize=(10, 6))
    plt.scatter(weather_df['Temperature'], weather_df['Wind Speed'], color='green')
    plt.title('Temperature vs. Wind Speed')
    plt.xlabel('Temperature (°C)')
    plt.ylabel('Wind Speed (m/s)')
    plt.grid(True)
    plt.show()

    # Heatmap of Temperature vs Humidity
    heatmap_data = weather_df[['Temperature', 'Humidity']]
    plt.figure(figsize=(10, 6))
    sns.heatmap(heatmap_data, annot=True, cmap='coolwarm')
    plt.title('Temperature vs Humidity Heatmap')
    plt.show()

    # Scatter plot to visualize the relationship between temperature and humidity
    plt.figure(figsize=(10, 6))
    plt.scatter(weather_df['Temperature'], weather_df['Humidity'])
    plt.xlabel('Temperature (°C)')
    plt.ylabel('Humidity (%)')
    plt.title('Temperature vs Humidity Scatter Plot')
    plt.grid(True)
    plt.show()
else:
    print("No weather data available.")
                     Temperature  Humidity  Wind Speed Weather Description
Timestamp                                                                 
2024-11-04 18:00:00        24.47        49        3.18       broken clouds
2024-11-04 21:00:00        23.93        51        2.52    scattered clouds
2024-11-05 00:00:00        22.40        55        1.89    scattered clouds
2024-11-05 03:00:00        24.66        50        2.54           clear sky
2024-11-05 06:00:00        29.46        37        3.93           clear sky
Maximum Temperature: 31.40°C
Minimum Temperature: 20.29°C
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
'''Pract 9 - 9 Data Cleaning and Preparation
Problem Statement: Analyzing Customer Churn in a Telecommunications Company
Dataset: "Telecom_Customer_Churn.csv"
Description: The dataset contains information about customers of a telecommunications
company and whether they have churned (i.e., discontinued their services). The dataset
includes various attributes of the customers, such as their demographics, usage patterns, and
account information. The goal is to perform data cleaning and preparation to gain insights
into the factors that contribute to customer churn.
Tasks to Perform:
1. Import the "Telecom_Customer_Churn.csv" dataset.
2. Explore the dataset to understand its structure and content.
3. Handle missing values in the dataset, deciding on an appropriate strategy.
4. Remove any duplicate records from the dataset.
5. Check for inconsistent data, such as inconsistent formatting or spelling variations,
and standardize it.
6. Convert columns to the correct data types as needed.
7. Identify and handle outliers in the data.
8. Perform feature engineering, creating new features that may be relevant to
predicting customer churn.
9. Normalize or scale the data if necessary.
Faculty of Science and Technology Savitribai Phule Pune University
Syllabus for Fourth Year of Artificial Intelligence and Data Science (2020 Course) 47/126
10. Split the dataset into training and testing sets for further analysis.
11. Export the cleaned dataset for future analysis or modeling.
In [ ]:
 
In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt


data = pd.read_csv("Telco-Customer-Churn.csv")
print(data.index)

data.head()

print(data.columns)


data.shape 

data.nunique()

data.isna().sum()

data.isnull().sum()

# Check the number of rows before removing duplicates 
print("Number of rows before removing duplicates:", len(data))

# Remove duplicate records
data_cleaned = data.drop_duplicates()

# Remove duplicate records
data_cleaned = data.drop_duplicates()

data.describe()

# Measure of frequency destribution
unique, counts = np.unique(data['tenure'], return_counts=True) 
print(unique, counts)

# Measure of frequency destribution
unique, counts = np.unique(data['MonthlyCharges'], return_counts=True) 
print(unique, counts)

# Measure of frequency destribution
unique, counts = np.unique(data['TotalCharges'], return_counts=True) 
print(unique, counts)

sns.pairplot(data)

plt.boxplot(data['tenure'])
plt.show()

plt.boxplot(data['MonthlyCharges']) 
plt.show()

X = data.drop("Churn", axis=1) 
y = data["Churn"]


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape 

y_train.shape 

X_test.shape 

y_test.shape 

# Export the cleaned dataset to a CSV file 
data.to_csv("Cleaned_Telecom_Customer_Churn.csv", index=False)
RangeIndex(start=0, stop=7043, step=1)
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')
Number of rows before removing duplicates: 7043
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72] [ 11 613 238 200 176 133 110 131 123 119 116  99 117 109  76  99  80  87
  97  73  71  63  90  85  94  79  79  72  57  72  72  65  69  64  65  88
  50  65  59  56  64  70  65  65  51  61  74  68  64  66  68  68  80  70
  68  64  80  65  67  60  76  76  70  72  80  76  89  98 100  95 119 170
 362]
[ 18.25  18.4   18.55 ... 118.6  118.65 118.75] [1 1 1 ... 2 1 1]
[' ' '100.2' '100.25' ... '999.45' '999.8' '999.9'] [11  1  1 ...  1  1  1]
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]:
#OR 
In [15]:
import pandas as pd #data manipulation
import numpy as np #numerical computations
from sklearn.model_selection import train_test_split # scikit-learn fo
from sklearn import metrics #evaluating the performance of

data = pd.read_csv("Telco-Customer-Churn.csv")
print(data.index)


print(data)

print(data.columns)

data.shape

print(data.head())


print(data.tail())


data.nunique()

# data.isna().sum() is used to count the number of missing values (NaN values) in e
data.isna().sum()

# isna() and isnull() are essentially the same method in Pandas, and they both retu
data.isnull().sum()

# Check the number of rows before removing duplicates
print("Number of rows before removing duplicates:", len(data))



# Remove duplicate records
data_cleaned = data.drop_duplicates()

# Check the number of rows after removing duplicates
print("Number of rows after removing duplicates:", len(data_cleaned))



data.describe()

#Measure of frequency destribution
unique, counts = np.unique(data['tenure'], return_counts=True)
print(unique, counts)

#Measure of frequency destribution
unique, counts = np.unique(data['MonthlyCharges'], return_counts=True)
print(unique, counts)

#Measure of frequency destribution
unique, counts = np.unique(data['TotalCharges'], return_counts=True)
print(unique, counts)

# sns.pairplot(data) creates a grid of pairwise plots of the variables in a dataset
import seaborn as sns #Seaborn library for data visualization
sns.pairplot(data)

#checking boxplot for Fare column
import matplotlib.pyplot as plt #pyplot module from the Matplotlib lib
plt.boxplot(data['tenure'])
plt.show()

plt.boxplot(data['MonthlyCharges'])
plt.show()

X = data.drop("Churn", axis=1)
y = data["Churn"]
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#Export the cleaned data



X_train.shape



y_train.shape



X_test.shape



y_test.shape
                                                    

# Export the cleaned dataset to a CSV file
data.to_csv("Cleaned_Telecom_Customer_Churn.csv", index=False)  
RangeIndex(start=0, stop=7043, step=1)
      customerID  gender  SeniorCitizen Partner Dependents  tenure  \
0     7590-VHVEG  Female              0     Yes         No       1   
1     5575-GNVDE    Male              0      No         No      34   
2     3668-QPYBK    Male              0      No         No       2   
3     7795-CFOCW    Male              0      No         No      45   
4     9237-HQITU  Female              0      No         No       2   
...          ...     ...            ...     ...        ...     ...   
7038  6840-RESVB    Male              0     Yes        Yes      24   
7039  2234-XADUH  Female              0     Yes        Yes      72   
7040  4801-JZAZL  Female              0     Yes        Yes      11   
7041  8361-LTMKD    Male              1     Yes         No       4   
7042  3186-AJIEK    Male              0      No         No      66   

     PhoneService     MultipleLines InternetService OnlineSecurity  ...  \
0              No  No phone service             DSL             No  ...   
1             Yes                No             DSL            Yes  ...   
2             Yes                No             DSL            Yes  ...   
3              No  No phone service             DSL            Yes  ...   
4             Yes                No     Fiber optic             No  ...   
...           ...               ...             ...            ...  ...   
7038          Yes               Yes             DSL            Yes  ...   
7039          Yes               Yes     Fiber optic             No  ...   
7040           No  No phone service             DSL            Yes  ...   
7041          Yes               Yes     Fiber optic             No  ...   
7042          Yes                No     Fiber optic            Yes  ...   

     DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0                  No          No          No              No  Month-to-month   
1                 Yes          No          No              No        One year   
2                  No          No          No              No  Month-to-month   
3                 Yes         Yes          No              No        One year   
4                  No          No          No              No  Month-to-month   
...               ...         ...         ...             ...             ...   
7038              Yes         Yes         Yes             Yes        One year   
7039              Yes          No         Yes             Yes        One year   
7040               No          No          No              No  Month-to-month   
7041               No          No          No              No  Month-to-month   
7042              Yes         Yes         Yes             Yes        Two year   

     PaperlessBilling              PaymentMethod MonthlyCharges  TotalCharges  \
0                 Yes           Electronic check          29.85         29.85   
1                  No               Mailed check          56.95        1889.5   
2                 Yes               Mailed check          53.85        108.15   
3                  No  Bank transfer (automatic)          42.30       1840.75   
4                 Yes           Electronic check          70.70        151.65   
...               ...                        ...            ...           ...   
7038              Yes               Mailed check          84.80        1990.5   
7039              Yes    Credit card (automatic)         103.20        7362.9   
7040              Yes           Electronic check          29.60        346.45   
7041              Yes               Mailed check          74.40         306.6   
7042              Yes  Bank transfer (automatic)         105.65        6844.5   

     Churn  
0       No  
1       No  
2      Yes  
3       No  
4      Yes  
...    ...  
7038    No  
7039    No  
7040    No  
7041   Yes  
7042    No  

[7043 rows x 21 columns]
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract PaperlessBilling  \
0          No          No              No  Month-to-month              Yes   
1          No          No              No        One year               No   
2          No          No              No  Month-to-month              Yes   
3         Yes          No              No        One year               No   
4          No          No              No  Month-to-month              Yes   

               PaymentMethod MonthlyCharges  TotalCharges Churn  
0           Electronic check          29.85         29.85    No  
1               Mailed check          56.95        1889.5    No  
2               Mailed check          53.85        108.15   Yes  
3  Bank transfer (automatic)          42.30       1840.75    No  
4           Electronic check          70.70        151.65   Yes  

[5 rows x 21 columns]
      customerID  gender  SeniorCitizen Partner Dependents  tenure  \
7038  6840-RESVB    Male              0     Yes        Yes      24   
7039  2234-XADUH  Female              0     Yes        Yes      72   
7040  4801-JZAZL  Female              0     Yes        Yes      11   
7041  8361-LTMKD    Male              1     Yes         No       4   
7042  3186-AJIEK    Male              0      No         No      66   

     PhoneService     MultipleLines InternetService OnlineSecurity  ...  \
7038          Yes               Yes             DSL            Yes  ...   
7039          Yes               Yes     Fiber optic             No  ...   
7040           No  No phone service             DSL            Yes  ...   
7041          Yes               Yes     Fiber optic             No  ...   
7042          Yes                No     Fiber optic            Yes  ...   

     DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
7038              Yes         Yes         Yes             Yes        One year   
7039              Yes          No         Yes             Yes        One year   
7040               No          No          No              No  Month-to-month   
7041               No          No          No              No  Month-to-month   
7042              Yes         Yes         Yes             Yes        Two year   

     PaperlessBilling              PaymentMethod MonthlyCharges  TotalCharges  \
7038              Yes               Mailed check          84.80        1990.5   
7039              Yes    Credit card (automatic)         103.20        7362.9   
7040              Yes           Electronic check          29.60        346.45   
7041              Yes               Mailed check          74.40         306.6   
7042              Yes  Bank transfer (automatic)         105.65        6844.5   

     Churn  
7038    No  
7039    No  
7040    No  
7041   Yes  
7042    No  

[5 rows x 21 columns]
Number of rows before removing duplicates: 7043
Number of rows after removing duplicates: 7043
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72] [ 11 613 238 200 176 133 110 131 123 119 116  99 117 109  76  99  80  87
  97  73  71  63  90  85  94  79  79  72  57  72  72  65  69  64  65  88
  50  65  59  56  64  70  65  65  51  61  74  68  64  66  68  68  80  70
  68  64  80  65  67  60  76  76  70  72  80  76  89  98 100  95 119 170
 362]
[ 18.25  18.4   18.55 ... 118.6  118.65 118.75] [1 1 1 ... 2 1 1]
[' ' '100.2' '100.25' ... '999.45' '999.8' '999.9'] [11  1  1 ...  1  1  1]
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]:
'''Pract 10 - 10 Data Wrangling
Problem Statement: Data Wrangling on Real Estate Market
Dataset: "RealEstate_Prices.csv"
Description: The dataset contains information about housing prices in a specific real estate
market. It includes various attributes such as property characteristics, location, sale prices,
and other relevant features. The goal is to perform data wrangling to gain insights into the
factors influencing housing prices and prepare the dataset for further analysis or modeling.
Tasks to Perform:
1. Import the "RealEstate_Prices.csv" dataset. Clean column names by removing spaces,
special characters, or renaming them for clarity.
2. Handle missing values in the dataset, deciding on an appropriate strategy (e.g.,
imputation or removal).
3. Perform data merging if additional datasets with relevant information are available
(e.g., neighborhood demographics or nearby amenities).
4. Filter and subset the data based on specific criteria, such as a particular time period,
property type, or location.
5. Handle categorical variables by encoding them appropriately (e.g., one-hot encoding or
label encoding) for further analysis.
6. Aggregate the data to calculate summary statistics or derived metrics such as average
sale prices by neighborhood or property type.
7. Identify and handle outliers or extreme values in the data that may affect the analysis
or modeling process
In [4]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt
import warnings


# Supressing update warnings
warnings.filterwarnings('ignore')

df1 = pd.read_csv("Bengaluru_House_Data.csv") 


df1.head()

df1.shape 

df1.columns

df1['area_type']

df1['area_type'].unique()

df1['area_type'].value_counts()

df2 = df1.drop(['area_type','society','balcony','availability'],axis='columns') 

df2.shape

df2.isnull().sum()

df2.shape 

df3 = df2.dropna() 
df3.isnull().sum()

df3.shape 

df3['size'].unique()

df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))

df3.head()

df3.bhk.unique()

df3[df3.bhk>20]

df3.total_sqft.unique()

def is_float(x):
    try:
        float(x) 
        return True
    except(ValueError, TypeError):
        return False 

df3[~df3['total_sqft'].apply(is_float)].head(10)


def convert_sqft_to_num(x): 
    tokens = x.split('-')
    if len(tokens) == 2:
        try:
            return (float(tokens[0])+float(tokens[1]))/2
        except ValueError:
            return None
    try:
        return float(x) 
    except ValueError:
        return None 
    
result = convert_sqft_to_num('2100 - 2850')
print(result)


convert_sqft_to_num('34.46Sq. Meter') 
df4 = df3.copy()
df4.total_sqft = df4.total_sqft.apply(convert_sqft_to_num) 
df4

df4 = df4[df4.total_sqft.notnull()] 
df4

df4.loc[30]

df5 = df4.copy()
df5['price_per_sqft'] = df5['price']*100000/df5['total_sqft'] 
df5.head()

df5_stats = df5['price_per_sqft'].describe() 
df5_stats

df5.to_csv("bhp.csv",index=False)

df5.location = df5.location.apply(lambda x: x.strip()) 
location_stats = df5['location'].value_counts(ascending=False) 
location_stats

len(location_stats[location_stats>10])

len(location_stats) 

len(location_stats[location_stats<=10]) 

location_stats_less_than_10 = location_stats[location_stats<=10] 
location_stats_less_than_10

len(df5.location.unique())

df5.location = df5.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x) 
len(df5.location.unique())

df5.head(10)

df5[df5.total_sqft/df5.bhk<300].head()

df5.shape 

df6 = df5[~(df5.total_sqft/df5.bhk<300)] 
df6.shape

df6.columns

plt.boxplot(df6['total_sqft']) 
plt.show()


Q1 = np.percentile(df6['total_sqft'], 25.) # 25th percentile of the data of the given feature 
Q3 = np.percentile(df6['total_sqft'], 75.) # 75th percentile of the data of the given feature 
IQR = Q3-Q1 #Interquartile Range
ll = Q1 - (1.5*IQR) 
ul = Q3 + (1.5*IQR)
upper_outliers = df6[df6['total_sqft'] > ul].index.tolist() 
lower_outliers = df6[df6['total_sqft'] < ll].index.tolist() 
bad_indices = list(set(upper_outliers + lower_outliers)) 
drop = True
if drop:
    df6.drop(bad_indices, inplace = True, errors = 'ignore')

plt.boxplot(df6['bath']) 
plt.show()


Q1 = np.percentile(df6['bath'], 25.) # 25th percentile of the data of the given feature 
Q3 = np.percentile(df6['bath'], 75.) # 75th percentile of the data of the given feature 
IQR = Q3-Q1 #Interquartile Range
ll = Q1 - (1.5*IQR) 
ul = Q3 + (1.5*IQR)
upper_outliers = df6[df6['bath'] > ul].index.tolist() 
lower_outliers = df6[df6['bath'] < ll].index.tolist() 
bad_indices = list(set(upper_outliers + lower_outliers)) 
drop = True
if drop:
    df6.drop(bad_indices, inplace = True, errors = 'ignore')
plt.boxplot(df6['price']) 
plt.show()



Q1 = np.percentile(df6['price'], 25.) # 25th percentile of the data of the given feature 
Q3 = np.percentile(df6['price'], 75.) # 75th percentile of the data of the given feature 
IQR = Q3-Q1 #Interquartile Range
ll = Q1 - (1.5*IQR) 
ul = Q3 + (1.5*IQR)
 
upper_outliers = df6[df6['price'] > ul].index.tolist() 
lower_outliers = df6[df6['price'] < ll].index.tolist() 
bad_indices = list(set(upper_outliers + lower_outliers)) 
drop = True
if drop:
    df6.drop(bad_indices, inplace = True, errors = 'ignore')

plt.boxplot(df6['bhk']) 
plt.show()



Q1 = np.percentile(df6['bhk'], 25.) # 25th percentile of the data of the given feature 
Q3 = np.percentile(df6['bhk'], 75.) # 75th percentile of the data of the given feature 
IQR = Q3-Q1 #Interquartile Range
ll = Q1 - (1.5*IQR) 
ul = Q3 + (1.5*IQR)
upper_outliers = df6[df6['bhk'] > ul].index.tolist() 
lower_outliers = df6[df6['bhk'] < ll].index.tolist() 
bad_indices = list(set(upper_outliers + lower_outliers)) 
drop = True
if drop:
    df6.drop(bad_indices, inplace = True, errors = 'ignore')

plt.boxplot(df6['price_per_sqft']) 
plt.show()



Q1 = np.percentile(df6['price_per_sqft'], 25.) # 25th percentile of the data of the given feature 
Q3 = np.percentile(df6['price_per_sqft'], 75.) # 75th percentile of the data of the given feature 
IQR = Q3-Q1 #Interquartile Range
ll = Q1 - (1.5*IQR) 
ul = Q3 + (1.5*IQR)
upper_outliers = df6[df6['price_per_sqft'] > ul].index.tolist() 
lower_outliers = df6[df6['price_per_sqft'] < ll].index.tolist() 
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True 
if drop:
    df6.drop(bad_indices, inplace = True, errors = 'ignore')

plt.boxplot(df6['price_per_sqft']) 
plt.show()



df6.shape

X = df6.drop(['price'],axis='columns') 
X.head(3)

X.shape 

y = df6.price 
y.head(3)

len(y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

X_train.shape 


y_train.shape 


X_test.shape 


y_test.shape 
2475.0
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Out[4]:
(2018,)
In [ ]:
 
In [ ]:
#OR 
In [7]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)

# Load and inspect data
df1 = pd.read_csv("Bengaluru_House_Data.csv")
df1.head()

df1.shape

df1.columns

df1['area_type']

df1['area_type'].unique()

df1['area_type'].value_counts()

# Drop unnecessary columns
df2 = df1.drop(['area_type','society','balcony','availability'], axis='columns')
df2.shape

# Check for null values
df2.isnull().sum()

df2.shape

# Drop rows with null values
df3 = df2.dropna()
df3.isnull().sum()

df3.shape

# Feature engineering
df3['size'].unique()

df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))

df3.head()

df3.bhk.unique()

df3[df3.bhk > 20]

df3.total_sqft.unique()

# Explore total_sqft feature
def is_float(x):
    try:
        float(x)
        return True
    except:
        return False

df3[~df3['total_sqft'].apply(is_float)].head(10)

def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0]) + float(tokens[1])) / 2
    try:
        return float(x)
    except:
        return None

convert_sqft_to_num('2100 - 2850')

convert_sqft_to_num('34.46Sq. Meter')

# Apply conversion function
df4 = df3.copy()
df4['total_sqft'] = df4['total_sqft'].apply(convert_sqft_to_num)
df4 = df4[df4.total_sqft.notnull()]

df4.loc[30]

# Adding price per square foot
df5 = df4.copy()
df5['price_per_sqft'] = df5['price'] * 100000 / df5['total_sqft']
df5.head()

# Statistics of price per square foot
df5_stats = df5['price_per_sqft'].describe()
df5_stats

df5.to_csv("bhp.csv", index=False)

# Unique locations count
len(df5.location.unique())

df5.location = df5.location.apply(lambda x: x.strip())
location_stats = df5['location'].value_counts(ascending=False)
location_stats

len(location_stats[location_stats > 10])

len(location_stats)

len(location_stats[location_stats <= 10])

# Dimensionality reduction
location_stats_less_than_10 = location_stats[location_stats <= 10]
location_stats_less_than_10

len(df5.location.unique())

df5.location = df5.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
len(df5.location.unique())

# Outlier Removal Using Business Logic
df5[df5.total_sqft / df5.bhk < 300].head()

df5.shape

df6 = df5[~(df5.total_sqft / df5.bhk < 300)]
df6.shape

df6.columns

# Boxplot for total_sqft
plt.boxplot(df6['total_sqft'])
plt.show()

# IQR method for outlier detection and removal
Q1 = np.percentile(df6['total_sqft'], 25)
Q3 = np.percentile(df6['total_sqft'], 75)
IQR = Q3 - Q1
ll = Q1 - (1.5 * IQR)
ul = Q3 + (1.5 * IQR)
upper_outliers = df6[df6['total_sqft'] > ul].index.tolist()
lower_outliers = df6[df6['total_sqft'] < ll].index.tolist()
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True
if drop:
    df6.drop(bad_indices, inplace=True, errors='ignore')

# Boxplot for bath
plt.boxplot(df6['bath'])
plt.show()

Q1 = np.percentile(df6['bath'], 25)
Q3 = np.percentile(df6['bath'], 75)
IQR = Q3 - Q1
ll = Q1 - (1.5 * IQR)
ul = Q3 + (1.5 * IQR)
upper_outliers = df6[df6['bath'] > ul].index.tolist()
lower_outliers = df6[df6['bath'] < ll].index.tolist()
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True
if drop:
    df6.drop(bad_indices, inplace=True, errors='ignore')

# Boxplot for price
plt.boxplot(df6['price'])
plt.show()

Q1 = np.percentile(df6['price'], 25)
Q3 = np.percentile(df6['price'], 75)
IQR = Q3 - Q1
ll = Q1 - (1.5 * IQR)
ul = Q3 + (1.5 * IQR)
upper_outliers = df6[df6['price'] > ul].index.tolist()
lower_outliers = df6[df6['price'] < ll].index.tolist()
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True
if drop:
    df6.drop(bad_indices, inplace=True, errors='ignore')

# Boxplot for bhk
plt.boxplot(df6['bhk'])
plt.show()

Q1 = np.percentile(df6['bhk'], 25)
Q3 = np.percentile(df6['bhk'], 75)
IQR = Q3 - Q1
ll = Q1 - (1.5 * IQR)
ul = Q3 + (1.5 * IQR)
upper_outliers = df6[df6['bhk'] > ul].index.tolist()
lower_outliers = df6[df6['bhk'] < ll].index.tolist()
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True
if drop:
    df6.drop(bad_indices, inplace=True, errors='ignore')

# Boxplot for price_per_sqft
plt.boxplot(df6['price_per_sqft'])
plt.show()

Q1 = np.percentile(df6['price_per_sqft'], 25)
Q3 = np.percentile(df6['price_per_sqft'], 75)
IQR = Q3 - Q1
ll = Q1 - (1.5 * IQR)
ul = Q3 + (1.5 * IQR)
upper_outliers = df6[df6['price_per_sqft'] > ul].index.tolist()
lower_outliers = df6[df6['price_per_sqft'] < ll].index.tolist()
bad_indices = list(set(upper_outliers + lower_outliers))
drop = True
if drop:
    df6.drop(bad_indices, inplace=True, errors='ignore')

df6.shape

# Splitting data into features and target variable
X = df6.drop(['price'], axis='columns')
X.head(3)

X.shape

y = df6.price
y.head(3)

len(y)

# Splitting dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape

y_train.shape

X_test.shape

y_test.shape
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Out[7]:
(2018,)
In [ ]:
 
In [ ]:
'''pract11 - Data Visualization using matplotlib
Problem Statement: Analyzing Air Quality Index (AQI) Trends in a City
Dataset: "City_Air_Quality.csv"
Description: The dataset contains information about air quality measurements in a specific
city over a period of time. It includes attributes such as date, time, pollutant levels (e.g., PM2.5,
PM10, CO), and the Air Quality Index (AQI) values. The goal is to use the matplotlib library
to create visualizations that effectively represent the AQI trends and patterns for different
pollutants in the city.
Tasks to Perform:
1. Import the "City_Air_Quality.csv" dataset.
2. Explore the dataset to understand its structure and content.
3. Identify the relevant variables for visualizing AQI trends, such as date, pollutant levels,
and AQI values.
4. Create line plots or time series plots to visualize the overall AQI trend over time.
5. Plot individual pollutant levels (e.g., PM2.5, PM10, CO) on separate line plots to
visualize their trends over time.
6. Use bar plots or stacked bar plots to compare the AQI values across different dates or
time periods.
7. Create box plots or violin plots to analyze the distribution of AQI values for different
pollutant categories.
8. Use scatter plots or bubble charts to explore the relationship between AQI values and
pollutant levels.
9. Customize the visualizations by adding labels, titles, legends, and appropriate color
schemes
In [14]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.impute import SimpleImputer
import warnings 

# Suppressing warnings
warnings.filterwarnings('ignore') 

# Load the data
data = pd.read_csv("India Air Quality Data.csv", encoding="cp1252") 
data.info()

# Clean up state name changes
data['state'] = data['state'].replace({'Uttaranchal':'Uttarakhand'}) 
data.loc[data['location'] == "Jamshedpur", 'state'] = 'Jharkhand'

# Uniform format for type column
types = {
    "Residential": "R", 
    "Residential and others": "RO",
    "Residential, Rural and other Areas": "RRO", 
    "Industrial Area": "I",
    "Industrial Areas": "I", 
    "Industrial": "I", 
    "Sensitive Area": "S", 
    "Sensitive Areas": "S", 
    "Sensitive": "S", 
    np.nan: "RRO"
}
data['type'] = data['type'].replace(types) 

# Columns of interest
VALUE_COLS = ['so2', 'no2', 'rspm', 'spm', 'pm2_5']

# Impute missing values with mean for selected columns
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') 
data[VALUE_COLS] = imputer.fit_transform(data[VALUE_COLS])

# Check for any remaining null values
print(data.isnull().sum())

# Plotting top 10 and bottom 10 states by pollution level
def top_and_bottom_10_states(indicator="so2"):
    fig, ax = plt.subplots(2, 1, figsize=(20, 12))
    ind = data[[indicator, 'state']].groupby('state', as_index=False).median().sort_values(by=indicator, ascending=False)
    
    # Top 10 states
    sns.barplot(x='state', y=indicator, data=ind[:10], ax=ax[0], color='red')
    ax[0].set_title(f"Top 10 states by {indicator} (1991-2016)")
    ax[0].set_ylabel(f"{indicator} (µg/m3)")
    ax[0].set_xlabel("State")
    
    # Bottom 10 states
    sns.barplot(x='state', y=indicator, data=ind[-10:], ax=ax[1], color='green')
    ax[1].set_title(f"Bottom 10 states by {indicator} (1991-2016)")
    ax[1].set_ylabel(f"{indicator} (µg/m3)")
    ax[1].set_xlabel("State")

top_and_bottom_10_states("so2") 
top_and_bottom_10_states("no2")

# Plotting the highest recorded levels for each state
def highest_levels_recorded(indicator="so2"): 
    plt.figure(figsize=(20, 10))
    
    # Ensure the indicator column is numeric
    data[indicator] = pd.to_numeric(data[indicator], errors='coerce')
    
    # Drop rows with NaN in the indicator column before aggregation
    filtered_data = data.dropna(subset=[indicator])
    
    # Group by state and find max for the indicator
    ind = filtered_data.groupby('state', as_index=False).agg({indicator: 'max'})
    
    # Plotting
    sns.barplot(x='state', y=indicator, data=ind)
    plt.title(f"Highest ever {indicator} levels recorded by state")
    plt.xticks(rotation=90)

highest_levels_recorded("no2") 
highest_levels_recorded("rspm")

# Plot pollutant averages by type
def type_avg(indicator=""):
    type_avg_data = data[VALUE_COLS + ['type']].groupby("type").mean() 
    if indicator:
        type_avg_data[indicator].plot(kind='bar')
        plt.title(f"Pollutant average by type for {indicator}")
    else:
        type_avg_data.plot(kind='bar')
        plt.title("Pollutant average by type")
    plt.xticks(rotation=0)
    plt.show()

type_avg('so2')

# Plot pollutant averages by location within a state
def location_avgs(state, indicator="so2"):
    locs = data[VALUE_COLS + ['state', 'location']].groupby(['state', 'location']).mean() 
    state_avgs = locs.loc[state].reset_index()
    
    plt.figure(figsize=(15, 8))
    sns.barplot(x='location', y=indicator, data=state_avgs) 
    plt.title(f"Location-wise average for {indicator} in {state}")
    plt.xticks(rotation=90)
    plt.show()

location_avgs("Bihar", "no2")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435742 entries, 0 to 435741
Data columns (total 13 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   stn_code                     291665 non-null  object 
 1   sampling_date                435739 non-null  object 
 2   state                        435742 non-null  object 
 3   location                     435739 non-null  object 
 4   agency                       286261 non-null  object 
 5   type                         430349 non-null  object 
 6   so2                          401096 non-null  float64
 7   no2                          419509 non-null  float64
 8   rspm                         395520 non-null  float64
 9   spm                          198355 non-null  float64
 10  location_monitoring_station  408251 non-null  object 
 11  pm2_5                        9314 non-null    float64
 12  date                         435735 non-null  object 
dtypes: float64(5), object(8)
memory usage: 43.2+ MB
stn_code                       144077
sampling_date                       3
state                               0
location                            3
agency                         149481
type                                0
so2                                 0
no2                                 0
rspm                                0
spm                                 0
location_monitoring_station     27491
pm2_5                               0
date                                7
dtype: int64
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]:
#OR
In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import warnings
%matplotlib inline

# Load the dataset
aqi = pd.read_csv("India Air Quality Data.csv", encoding="ISO-8859-1", parse_dates=['date'])

# Check the first few rows of the dataset
aqi.head()

# Set the style for the plots
sns.set(style="ticks", rc={'figure.figsize': (20, 15)})

# Suppressing update warnings
warnings.filterwarnings('ignore')

# Checking the original dataset for null values and its shape
print(aqi.isnull().sum())
print(aqi.shape)
aqi.info()

# Cleaning up the data
aqi.drop(['stn_code', 'agency', 'sampling_date', 'location_monitoring_station'], axis=1, inplace=True)

# Dropping rows where no date is available
aqi = aqi.dropna(subset=['date'])

# Cleaning up name changes
aqi.state = aqi.state.replace({'Uttaranchal': 'Uttarakhand'})
aqi.loc[aqi.location == "Jamshedpur", 'state'] = 'Jharkhand'

# Changing types to a uniform format
types = {
    "Residential": "R",
    "Residential and others": "RO",
    "Residential, Rural and other Areas": "RRO",
    "Industrial Area": "I",
    "Industrial Areas": "I",
    "Industrial": "I",
    "Sensitive Area": "S",
    "Sensitive Areas": "S",
    "Sensitive": "S",
    np.nan: "RRO"
}
aqi['type'] = aqi['type'].replace(types)

# Display the cleaned dataset
aqi.head()

# Defining columns of importance that will be used regularly
VALUE_COLS = ['so2', 'no2', 'rspm', 'spm', 'pm2_5']

# Invoking SimpleImputer to fill missing values
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
aqi[VALUE_COLS] = imputer.fit_transform(aqi[VALUE_COLS])

# Checking to see if the dataset has any null values left
print(aqi.isnull().sum())
aqi.tail()

# Defining a function that plots SO2, NO2, RSPM, and SPM yearly average levels
def plot_for_state(state):
    fig, ax = plt.subplots(2, 2, figsize=(20, 12))
    fig.suptitle(state, size=20)
    
    # Filter data for the specific state and set date as index
    state_data = aqi[aqi.state == state].copy()
    state_data.set_index('date', inplace=True)

    # Resample data by year and calculate mean for each pollutant
    state_data = state_data[VALUE_COLS].resample('Y').mean()

    # Plot each pollutant in a subplot
    state_data['so2'].plot(legend=True, ax=ax[0][0], title="SO2")
    ax[0][0].set_ylabel("SO2 (μg/m3)")
    ax[0][0].set_xlabel("Year")

    state_data['no2'].plot(legend=True, ax=ax[0][1], title="NO2")
    ax[0][1].set_ylabel("NO2 (μg/m3)")
    ax[0][1].set_xlabel("Year")

    state_data['rspm'].plot(legend=True, ax=ax[1][0], title="RSPM")
    ax[1][0].set_ylabel("RSPM (PM10 μg/m3)")
    ax[1][0].set_xlabel("Year")

    state_data['spm'].plot(legend=True, ax=ax[1][1], title="SPM")
    ax[1][1].set_ylabel("SPM (PM10 μg/m3)")
    ax[1][1].set_xlabel("Year")

# Plotting for Uttar Pradesh
plot_for_state("Uttar Pradesh")

# Defining a function to find and plot the top 10 and bottom 10 states for a given indicator
def top_and_bottom_10_states(indicator="so2"):
    fig, ax = plt.subplots(2, 1, figsize=(20, 12))
    ind = aqi[[indicator, 'state']].groupby('state', as_index=False).median()

    top10 = sns.barplot(x='state', y=indicator, data=ind.nlargest(10, indicator), ax=ax[0])
    top10.set_title("Top 10 states by {} (1991-2016)".format(indicator))
    top10.set_ylabel("{} (μg/m3)".format(indicator))
    top10.set_xlabel("State")

    bottom10 = sns.barplot(x='state', y=indicator, data=ind.nsmallest(10, indicator), ax=ax[1])
    bottom10.set_title("Bottom 10 states by {} (1991-2016)".format(indicator))
    bottom10.set_ylabel("{} (μg/m3)".format(indicator))
    bottom10.set_xlabel("State")

# Plotting top and bottom states for SO2 and NO2
top_and_bottom_10_states("so2")
top_and_bottom_10_states("no2")

# Defining a function to find the highest ever recorded levels for a given indicator
def highest_levels_recorded(indicator="so2"):
    plt.figure(figsize=(20, 10))
    ind = aqi[[indicator, 'location', 'state', 'date']].groupby('state', as_index=False).max()
    highest = sns.barplot(x='state', y=indicator, data=ind)
    highest.set_title("Highest ever {} levels recorded by state".format(indicator))
    plt.xticks(rotation=90)

# Plotting highest levels recorded for NO2 and RSPM
highest_levels_recorded("no2")
highest_levels_recorded("rspm")

# Defining a function to plot the yearly trend values for a given indicator
def yearly_trend(state="", indicator="so2"):
    plt.figure(figsize=(20, 12))
    aqi['year'] = aqi.date.dt.year

    if state == "":
        year_wise = aqi[[indicator, 'year', 'state']].groupby('year', as_index=False).mean()
        trend = sns.pointplot(x='year', y=indicator, data=year_wise)
        trend.set_title('Yearly trend of {}'.format(indicator))
    else:
        year_wise = aqi[[indicator, 'year', 'state']].groupby(['state', 'year'], as_index=False).mean()
        trend = sns.pointplot(x='year', y=indicator, data=year_wise)
        trend.set_title('Yearly trend of {} for {}'.format(indicator, state))

# Plotting yearly trends for all states and West Bengal for NO2
yearly_trend()
yearly_trend("West Bengal", "no2")

# Defining a function to plot a heatmap for yearly median average for a given indicator
def indicator_by_state_and_year(indicator="so2"):
    plt.figure(figsize=(20, 20))
    hmap = sns.heatmap(
        data=aqi.pivot_table(values=indicator, index='state', columns='year', aggfunc='mean'),
        annot=True, linewidths=.5, cbar=True, square=True, cmap='inferno'
    )
    hmap.set_title("{} by state and year".format(indicator))

# Plotting heatmap for NO2
indicator_by_state_and_year('no2')

# Defining a function to plot pollutant averages by type for a given indicator
def type_avg(indicator=""):
    type_avg = aqi[VALUE_COLS + ['type', 'date']].groupby("type").mean()
    if indicator != "":
        t = type_avg[indicator].plot(kind='bar')
        plt.xticks(rotation=0)
        plt.title("Pollutant average by type for {}".format(indicator))
    else:
        t = type_avg.plot(kind='bar')
        plt.xticks(rotation=0)
        plt.title("Pollutant average by type")

# Plotting pollutant average by type for SO2
type_avg('so2')

# Defining a function to plot pollutant averages for a given indicator (default SO2)
def location_avgs(state, indicator="so2"):
    locs = aqi[VALUE_COLS + ['state', 'location', 'date']].groupby(['state', 'location']).mean()
    state_avgs = locs.loc[state].reset_index()
    sns.barplot(x='location', y=indicator, data=state_avgs)
    plt.title("Location-wise average for {} in {}".format(indicator, state))
    plt.xticks(rotation=90)

# Plotting location averages for Uttar Pradesh and NO2
location_avgs("Uttar Pradesh", "no2")
stn_code                       144077
sampling_date                       3
state                               0
location                            3
agency                         149481
type                             5393
so2                             34646
no2                             16233
rspm                            40222
spm                            237387
location_monitoring_station     27491
pm2_5                          426428
date                                7
dtype: int64
(435742, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435742 entries, 0 to 435741
Data columns (total 13 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   stn_code                     291665 non-null  object        
 1   sampling_date                435739 non-null  object        
 2   state                        435742 non-null  object        
 3   location                     435739 non-null  object        
 4   agency                       286261 non-null  object        
 5   type                         430349 non-null  object        
 6   so2                          401096 non-null  float64       
 7   no2                          419509 non-null  float64       
 8   rspm                         395520 non-null  float64       
 9   spm                          198355 non-null  float64       
 10  location_monitoring_station  408251 non-null  object        
 11  pm2_5                        9314 non-null    float64       
 12  date                         435735 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(5), object(7)
memory usage: 43.2+ MB
state       0
location    0
type        0
so2         0
no2         0
rspm        0
spm         0
pm2_5       0
date        0
dtype: int64
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\groupby.py:1874, in GroupBy._agg_py_fallback(self, how, values, ndim, alt)
   1873 try:
-> 1874     res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)
   1875 except Exception as err:

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\ops.py:849, in BaseGrouper.agg_series(self, obj, func, preserve_dtype)
    847     preserve_dtype = True
--> 849 result = self._aggregate_series_pure_python(obj, func)
    851 if len(obj) == 0 and len(result) == 0 and isinstance(obj.dtype, ExtensionDtype):

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\ops.py:877, in BaseGrouper._aggregate_series_pure_python(self, obj, func)
    876 for i, group in enumerate(splitter):
--> 877     res = func(group)
    878     res = extract_result(res)

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\groupby.py:2380, in GroupBy.mean.<locals>.<lambda>(x)
   2377 else:
   2378     result = self._cython_agg_general(
   2379         "mean",
-> 2380         alt=lambda x: Series(x).mean(numeric_only=numeric_only),
   2381         numeric_only=numeric_only,
   2382     )
   2383     return result.__finalize__(self.obj, method="groupby")

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\series.py:6225, in Series.mean(self, axis, skipna, numeric_only, **kwargs)
   6217 @doc(make_doc("mean", ndim=1))
   6218 def mean(
   6219     self,
   (...)
   6223     **kwargs,
   6224 ):
-> 6225     return NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\generic.py:11992, in NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)
  11985 def mean(
  11986     self,
  11987     axis: Axis | None = 0,
   (...)
  11990     **kwargs,
  11991 ) -> Series | float:
> 11992     return self._stat_function(
  11993         "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs
  11994     )

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\generic.py:11949, in NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs)
  11947 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
> 11949 return self._reduce(
  11950     func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
  11951 )

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\series.py:6133, in Series._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
   6129     raise TypeError(
   6130         f"Series.{name} does not allow {kwd_name}={numeric_only} "
   6131         "with non-numeric dtypes."
   6132     )
-> 6133 return op(delegate, skipna=skipna, **kwds)

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\nanops.py:147, in bottleneck_switch.__call__.<locals>.f(values, axis, skipna, **kwds)
    146 else:
--> 147     result = alt(values, axis=axis, skipna=skipna, **kwds)
    149 return result

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\nanops.py:404, in _datetimelike_compat.<locals>.new_func(values, axis, skipna, mask, **kwargs)
    402     mask = isna(values)
--> 404 result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)
    406 if datetimelike:

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\nanops.py:720, in nanmean(values, axis, skipna, mask)
    719 the_sum = values.sum(axis, dtype=dtype_sum)
--> 720 the_sum = _ensure_numeric(the_sum)
    722 if axis is not None and getattr(the_sum, "ndim", False):

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\nanops.py:1693, in _ensure_numeric(x)
   1691 if isinstance(x, str):
   1692     # GH#44008, GH#36703 avoid casting e.g. strings to numeric
-> 1693     raise TypeError(f"Could not convert string '{x}' to numeric")
   1694 try:

TypeError: Could not convert string 'BiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharBiharDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiDelhiGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGoaGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratGujaratHaryanaHaryanaHaryanaHaryanaHaryanaHaryanaHaryanaHaryanaHaryanaHaryanaHaryanaHimachal PradeshHimachal PradeshHimachal PradeshHimachal PradeshHimachal PradeshHimachal PradeshHimachal PradeshHimachal PradeshKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKarnatakaKeralaMaharashtraMaharashtraMaharashtraMaharashtraMaharashtraMaharashtraMaharashtraMaharashtraMaharashtraOdishaOdishaOdishaPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabPunjabRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanRajasthanTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduTamil NaduUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshUttar PradeshWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest BengalWest Bengal' to numeric

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
Cell In[18], line 143
    140         trend.set_title('Yearly trend of {} for {}'.format(indicator, state))
    142 # Plotting yearly trends for all states and West Bengal for NO2
--> 143 yearly_trend()
    144 yearly_trend("West Bengal", "no2")
    146 # Defining a function to plot a heatmap for yearly median average for a given indicator

Cell In[18], line 134, in yearly_trend(state, indicator)
    131 aqi['year'] = aqi.date.dt.year
    133 if state == "":
--> 134     year_wise = aqi[[indicator, 'year', 'state']].groupby('year', as_index=False).mean()
    135     trend = sns.pointplot(x='year', y=indicator, data=year_wise)
    136     trend.set_title('Yearly trend of {}'.format(indicator))

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\groupby.py:2378, in GroupBy.mean(self, numeric_only, engine, engine_kwargs)
   2371     return self._numba_agg_general(
   2372         grouped_mean,
   2373         executor.float_dtype_mapping,
   2374         engine_kwargs,
   2375         min_periods=0,
   2376     )
   2377 else:
-> 2378     result = self._cython_agg_general(
   2379         "mean",
   2380         alt=lambda x: Series(x).mean(numeric_only=numeric_only),
   2381         numeric_only=numeric_only,
   2382     )
   2383     return result.__finalize__(self.obj, method="groupby")

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\groupby.py:1929, in GroupBy._cython_agg_general(self, how, alt, numeric_only, min_count, **kwargs)
   1926     result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)
   1927     return result
-> 1929 new_mgr = data.grouped_reduce(array_func)
   1930 res = self._wrap_agged_manager(new_mgr)
   1931 out = self._wrap_aggregated_output(res)

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\internals\managers.py:1428, in BlockManager.grouped_reduce(self, func)
   1424 if blk.is_object:
   1425     # split on object-dtype blocks bc some columns may raise
   1426     #  while others do not.
   1427     for sb in blk._split():
-> 1428         applied = sb.apply(func)
   1429         result_blocks = extend_blocks(applied, result_blocks)
   1430 else:

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\internals\blocks.py:366, in Block.apply(self, func, **kwargs)
    360 @final
    361 def apply(self, func, **kwargs) -> list[Block]:
    362     """
    363     apply the function to my values; return a block if we are not
    364     one
    365     """
--> 366     result = func(self.values, **kwargs)
    368     result = maybe_coerce_values(result)
    369     return self._split_op_result(result)

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\groupby.py:1926, in GroupBy._cython_agg_general.<locals>.array_func(values)
   1923 else:
   1924     return result
-> 1926 result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)
   1927 return result

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\groupby\groupby.py:1878, in GroupBy._agg_py_fallback(self, how, values, ndim, alt)
   1876     msg = f"agg function failed [how->{how},dtype->{ser.dtype}]"
   1877     # preserve the kind of exception that raised
-> 1878     raise type(err)(msg) from err
   1880 if ser.dtype == object:
   1881     res_values = res_values.astype(object, copy=False)

TypeError: agg function failed [how->mean,dtype->object]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
<Figure size 2000x1200 with 0 Axes>
In [ ]:
 
In [ ]:
''' Pract 12 - 
Data Aggregation
Problem Statement: Analyzing Sales Performance by Region in a Retail Company
Dataset: "Retail_Sales_Data.csv"
Description: The dataset contains information about sales transactions in a retail company. It
includes attributes such as transaction date, product category, quantity sold, and sales
amount. The goal is to perform data aggregation to analyze the sales performance by region
and identify the top-performing regions.
Tasks to Perform:
1. Import the "Retail_Sales_Data.csv" dataset.
2. Explore the dataset to understand its structure and content.
3. Identify the relevant variables for aggregating sales data, such as region, sales
amount, and product category.
4. Group the sales data by region and calculate the total sales amount for each region.
5. Create bar plots or pie charts to visualize the sales distribution by region.
6. Identify the top-performing regions based on the highest sales amount.
7. Group the sales data by region and product category to calculate the total sales
amount for each combination.
8. Create stacked bar plots or grouped bar plots to compare the sales amounts across
different regions and product categories.
In [19]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("customer_shopping_data.csv") 
df.head()

df.tail()

# To check the count of records grouped by region/branch of the mall
df.groupby("shopping_mall").count()


# To check the count of records grouped by the product categories
df.groupby("category").count()

# total sales for each mall branch
branch_sales = df.groupby("shopping_mall").sum()
branch_sales

# total sales for each category of product
category_sales = df.groupby("category").sum()
category_sales

# to get the top performing branches
branch_sales.sort_values(by = "price", ascending = False)

# to get the top selling categories
category_sales.sort_values(by = "price", ascending = False)

# to get total sales for each combination of branch and product_category
combined_branch_category_sales = df.groupby(["shopping_mall", "category"]).sum()
combined_branch_category_sales

# pie chart for sales by branch
plt.pie(branch_sales["price"], labels = branch_sales.index) 
plt.show()

# pie chart for sales by product category
plt.pie(category_sales["price"], labels = category_sales.index) 
plt.show()

combined_pivot = df.pivot_table(index="shopping_mall", columns="category", values="price", aggfunc="sum") 

# grouped bar chart for sales of different categories at different branches
combined_pivot.plot(kind="bar", figsize=(10, 6)) 
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]: